diff --git a/src/runtime/optimizer.cc b/src/runtime/optimizer.cc
index fc30ae12cc..cf23cc39a5 100644
--- a/src/runtime/optimizer.cc
+++ b/src/runtime/optimizer.cc
@@ -522,7 +522,7 @@ void AdamOptimizer::unified_update(std::vector const parameters)
   }
   int offset = 0;
-  printf("param size: %d, %d\n", parameters.size(), parameters_num);
+  // printf("param size: %d, %d\n", parameters.size(), parameters_num);
   while(processed_parameters_num < parameters.size()){
@@ -540,7 +540,7 @@ void AdamOptimizer::unified_update(std::vector const parameters)
       assert(p->parallel_is != IndexSpace::NO_SPACE);
     }
-    printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize);
+    // printf("parameters_num: %d %d, %d\n", parameters_num, reservedWorkSpaceSize, model->handlers->workSpaceSize);
     assert(parameters_num <= parameters.size());
     IndexLauncher launcher(ADAM_UNIFY_UPD_NCCL_TASK_ID,
@@ -729,7 +729,12 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task,
   float const *w_grad_ptr[op->parameters_num];
   float *w_ptr[op->parameters_num], *v_ptr[op->parameters_num], *m_ptr[op->parameters_num];
-  size_t size[op->parameters_num];
+
+  hipMalloc(w_grad_ptr, sizeof(float*) * op->parameters_num);
+  hipMalloc(w_ptr, sizeof(float*) * op->parameters_num);
+  hipMalloc(v_ptr, sizeof(float*) * op->parameters_num);
+  hipMalloc(m_ptr, sizeof(float*) * op->parameters_num);
+  size_t *size = new size_t[op->parameters_num];
   int offset = 0;
   printf("parameters_num: %d\n", op->parameters_num);
@@ -741,10 +746,11 @@ void AdamOptimizer::nccl_unified_update_task(Task const *task,
     GenericTensorAccessorW accM = helperGetGenericTensorAccessorWO(DataType::DT_FLOAT, regions[offset+3], task->regions[offset+3], FID_DATA, ctx, runtime);
     offset += 4;
-    size[i] = accW.domain.get_volume();
+    size[i] = accWGrad.domain.get_volume();
     // assert(accWGrad.rect == accW.rect);
     // assert(accWGrad.rect == accV.rect);
     // assert(accWGrad.rect == accM.rect);
+
     w_grad_ptr[i] = accWGrad.get_float_ptr();
     w_ptr[i] = accW.get_float_ptr();
     v_ptr[i] = accV.get_float_ptr();
     m_ptr[i] = accM.get_float_ptr();
diff --git a/src/runtime/optimizer_kernel.cpp b/src/runtime/optimizer_kernel.cpp
index 5e7e613bc7..a0e3180185 100644
--- a/src/runtime/optimizer_kernel.cpp
+++ b/src/runtime/optimizer_kernel.cpp
@@ -262,18 +262,11 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o
   void *workSpace_ptr = meta->handle.workSpace;
   for(int i = 0; i < op->parameters_num; i++){
-    // hipMemcpyAsync(static_cast<char *>(workSpace_ptr),
-    //                w_grad_ptr[i],
-    //                size[i] * sizeof(float),
-    //                hipMemcpyDeviceToDevice,
-    //                stream);
-    // hipError_t error = hipGetLastError();
-    // if(error != hipSuccess)
-    // {
-    //   // print the CUDA error message and exit
-    //   printf("CUDA error: %s\n", hipGetErrorString(error));
-    // }
-
+    hipMemcpyAsync(workSpace_ptr,
+                   w_grad_ptr[i],
+                   size[i] * sizeof(float),
+                   hipMemcpyDeviceToDevice,
+                   stream);
     workSpace_ptr = static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
   }
@@ -292,7 +285,7 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(AdamOptimizer const *o
   float beta2_t = op->beta2_t;
   for(int i = 0; i < op->parameters_num; i++){
     // update
-    std::cout<<"update"<<"\n";
+    // printf("update %d\n", i);
     hipLaunchKernelGGL(HIP_KERNEL_NAME(adam_update),
                        GET_BLOCKS(size[i]),
                        CUDA_NUM_THREADS,
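
For reference, the packing loop added in nccl_unified_update_task_gpu copies each parameter's gradient into the contiguous NCCL workspace. A minimal standalone sketch of that step, with the per-copy error check that the old commented-out code performed, could look like the following; the helper name check_hip and the wrapper function are illustrative assumptions, while workSpace_ptr, w_grad_ptr, size, and stream follow the diff above.

#include <hip/hip_runtime.h>
#include <cstdio>

static void check_hip(hipError_t err, char const *what) {
  if (err != hipSuccess) {
    // Report the HIP error but keep going, mirroring the old printf-only check.
    fprintf(stderr, "HIP error in %s: %s\n", what, hipGetErrorString(err));
  }
}

static void pack_gradients_into_workspace(void *workSpace_ptr,
                                          float const *const *w_grad_ptr,
                                          size_t const *size,
                                          int parameters_num,
                                          hipStream_t stream) {
  for (int i = 0; i < parameters_num; i++) {
    // Copy this parameter's gradient into the next slot of the contiguous workspace.
    check_hip(hipMemcpyAsync(workSpace_ptr,
                             w_grad_ptr[i],
                             size[i] * sizeof(float),
                             hipMemcpyDeviceToDevice,
                             stream),
              "hipMemcpyAsync");
    // Advance the write cursor by the number of bytes just copied.
    workSpace_ptr = static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
  }
}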
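
For context, the adam_update kernel launched at the end of the diff applies the Adam step to each parameter chunk using GET_BLOCKS(size[i]) blocks of CUDA_NUM_THREADS threads. A textbook per-element Adam update, written as a hypothetical HIP kernel (the name, signature, and epsilon term are assumptions, not the actual FlexFlow kernel), is:

#include <hip/hip_runtime.h>

__global__ void adam_step_sketch(int count,
                                 float alpha_t, // bias-corrected step size
                                 float beta1,
                                 float beta2,
                                 float epsilon,
                                 float const *w_grad,
                                 float *m,
                                 float *v,
                                 float *w) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= count) {
    return;
  }
  float g = w_grad[i];
  // Exponential moving averages of the gradient and of its square.
  m[i] = beta1 * m[i] + (1.0f - beta1) * g;
  v[i] = beta2 * v[i] + (1.0f - beta2) * g * g;
  // Parameter update using the bias-corrected learning rate alpha_t.
  w[i] -= alpha_t * m[i] / (sqrtf(v[i]) + epsilon);
}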