diff --git a/Compiler/src/exastencils/base/ir/IR_Reduction.scala b/Compiler/src/exastencils/base/ir/IR_Reduction.scala
index ea6e67008..69c63ded9 100644
--- a/Compiler/src/exastencils/base/ir/IR_Reduction.scala
+++ b/Compiler/src/exastencils/base/ir/IR_Reduction.scala
@@ -21,4 +21,5 @@ package exastencils.base.ir
 /// IR_Reduction
 // FIXME: op as BinOp
-case class IR_Reduction(var op : String, var target : IR_Expression, var targetName : String, var skipMpi : Boolean = false) extends IR_Node
+case class IR_Reduction(var op : String, var target : IR_Expression, var targetName : String,
+  var skipMpi : Boolean = false, var skipOpenMP : Boolean = false) extends IR_Node
diff --git a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
index 1bf6669d9..e4d59609f 100644
--- a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
+++ b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
@@ -28,6 +28,7 @@ import exastencils.datastructures._
 import exastencils.logger.Logger
 import exastencils.optimization.ir._
 import exastencils.parallelization.ir._
+import exastencils.util.ir.IR_FragmentLoopCollector

 // FIXME: refactor
 object IR_LoopOverDimensions {
@@ -148,20 +149,8 @@ case class IR_LoopOverDimensions(
     }
   }

-  def parallelizationIsReasonable : Boolean = {
-    val maxItCount = maxIterationCount()
-    if (maxItCount == null)
-      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
-
-    var totalNumPoints : Long = 1
-    for (i <- maxItCount)
-      totalNumPoints *= i
-    totalNumPoints > Knowledge.omp_minWorkItemsPerThread * Knowledge.omp_numThreads
-  }
-
   def explParLoop = lcCSEApplied && parallelization.potentiallyParallel &&
-    Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverDimensions &&
-    parallelizationIsReasonable
+    Knowledge.omp_enabled && parallelizationOverDimensionsIsReasonable(maxIterationCount())

   def createOMPThreadsWrapper(body : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
     if (explParLoop) {
@@ -214,9 +203,9 @@ case class IR_LoopOverDimensions(
     nju
   }

-  def expandSpecial() : ListBuffer[IR_Statement] = {
+  def expandSpecial(collector : IR_FragmentLoopCollector) : ListBuffer[IR_Statement] = {
     def parallelizable(d : Int) = parallelization.potentiallyParallel && parDims.contains(d)
-    def parallelize(d : Int) = parallelizable(d) && Knowledge.omp_parallelizeLoopOverDimensions && parallelizationIsReasonable
+    def parallelize(d : Int) = parallelizable(d) && parallelizationOverDimensionsIsReasonable(maxIterationCount())

     // TODO: check interaction between at1stIt and condition (see also: TODO in polyhedron.Extractor.enterLoop)
     var wrappedBody : ListBuffer[IR_Statement] = body
@@ -250,6 +239,16 @@ case class IR_LoopOverDimensions(
       wrappedBody = ListBuffer[IR_Statement](loop)
     }

+    // propagate parallelization hints to enclosing fragment loop if parallel
+    if (Knowledge.omp_parallelizeLoopOverFragments && collector.getEnclosingFragmentLoop().isDefined) {
+      collector.getEnclosingFragmentLoop().get match {
+        case fragLoop : IR_LoopOverFragments =>
+          fragLoop.parallelization.parallelizationReasonable &&= parallelizationOverFragmentsIsReasonable(maxIterationCount())
+        case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if name == IR_LoopOverFragments.defIt.name =>
+          fragLoop.parallelization.parallelizationReasonable &&= parallelizationOverFragmentsIsReasonable(maxIterationCount())
+      }
+    }
+
     wrappedBody = createOMPThreadsWrapper(wrappedBody)

     wrappedBody
@@ -259,7 +258,11 @@ case class IR_LoopOverDimensions(
 /// IR_ResolveLoopOverDimensions

 object IR_ResolveLoopOverDimensions extends DefaultStrategy("Resolve LoopOverDimensions nodes") {
+  var collector = new IR_FragmentLoopCollector
+  this.register(collector)
+  this.onBefore = () => this.resetCollectors()
+
   this += new Transformation("Resolve", {
-    case loop : IR_LoopOverDimensions => loop.expandSpecial()
+    case loop : IR_LoopOverDimensions => loop.expandSpecial(collector)
   })
 }
diff --git a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
index 764dbd761..cbe7c208d 100644
--- a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
+++ b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
@@ -45,6 +45,10 @@ case class IR_LoopOverFragments(
     // TODO: separate omp and potentiallyParallel
     parallelization.potentiallyParallel = Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverFragments && parallelization.potentiallyParallel

+    // if there is no loop found, we determine here if omp parallelization is reasonable
+    if (Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverFragments & !body.exists(_.isInstanceOf[IR_HasParallelizationInfo]))
+      parallelization.potentiallyParallel &&= parallelizationOverFragmentsIsReasonable(Array(Knowledge.domain_numFragmentsPerBlock))
+
     val loop = IR_ForLoop(
       IR_VariableDeclaration(defIt, 0),
       IR_Lower(defIt, Knowledge.domain_numFragmentsPerBlock),
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
index 55f73bb69..616dd5bd4 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
@@ -23,9 +23,33 @@ import exastencils.base.ir._
 import exastencils.baseExt.ir._
 import exastencils.communication.ir._
 import exastencils.config._
+import exastencils.domain.ir.IR_IV_IsValidForDomain
 import exastencils.field.ir._
 import exastencils.prettyprinting._

+/// CUDA_DirtyFlagHelper
+
+object CUDA_DirtyFlagHelper {
+  def fragmentIdxIsValid(fragIdx : IR_Expression, domainIdx : IR_Expression) = {
+    if (fragIdx != IR_LoopOverFragments.defIt)
+      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(domainIdx, fragIdx)
+    else
+      IR_IV_IsValidForDomain(domainIdx, fragIdx)
+  }
+}
+
+/// CUDA_DirtyFlagCase
+
+object CUDA_DirtyFlagCase extends Enumeration {
+  type Access = Value
+  final val ANNOT : String = "DirtyFlagCase"
+
+  // CLEAR       : field/buffer was not updated -> no transfer needed
+  // INTERMEDIATE: field/buffer was updated -> possibly need to wait for event before setting to DIRTY
+  // DIRTY       : field/buffer was updated -> transfer needed if execution hardware changes
+  final val CLEAR, INTERMEDIATE, DIRTY = Value
+}
+
 /// CUDA_HostDataUpdated

 // TODO: move to communication package?
@@ -35,7 +59,14 @@ case class CUDA_HostDataUpdated(override var field : IR_Field, override var slot
   override def usesFieldArrays : Boolean = !Knowledge.data_useFieldNamesAsIdx

   override def resolveName() = s"hostDataUpdated" + resolvePostfix(fragmentIdx.prettyprint, "", if (Knowledge.data_useFieldNamesAsIdx) field.name else field.index.toString, field.level.toString, "")
-  override def resolveDefValue() = Some(true)
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.DIRTY.id)
+
+  override def resolveDatatype() = {
+    if (field.numSlots > 1)
+      IR_ArrayDatatype(IR_IntegerDatatype, field.numSlots)
+    else
+      IR_IntegerDatatype
+  }
 }

 /// CUDA_DeviceDataUpdated
@@ -47,7 +78,14 @@ case class CUDA_DeviceDataUpdated(override var field : IR_Field, override var sl
   override def usesFieldArrays : Boolean = !Knowledge.data_useFieldNamesAsIdx

   override def resolveName() = s"deviceDataUpdated" + resolvePostfix(fragmentIdx.prettyprint, "", if (Knowledge.data_useFieldNamesAsIdx) field.name else field.index.toString, field.level.toString, "")
-  override def resolveDefValue() = Some(false)
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
+
+  override def resolveDatatype() = {
+    if (field.numSlots > 1)
+      IR_ArrayDatatype(IR_IntegerDatatype, field.numSlots)
+    else
+      IR_IntegerDatatype
+  }
 }

 /// CUDA_HostBufferDataUpdated
@@ -57,8 +95,8 @@ case class CUDA_HostBufferDataUpdated(var field : IR_Field, var direction : Stri
   override def prettyprint(out : PpStream) : Unit = out << resolveAccess(resolveName(), fragmentIdx, IR_NullExpression, field.index, field.level, neighIdx)

   override def resolveName() = s"hostBufferDataUpdated_$direction" + resolvePostfix(fragmentIdx.prettyprint, "", field.index.toString, field.level.toString, neighIdx.prettyprint)
-  override def resolveDatatype() = IR_BooleanDatatype
-  override def resolveDefValue() = Some(false)
+  override def resolveDatatype() = IR_IntegerDatatype
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
 }

 /// CUDA_DeviceBufferDataUpdated
@@ -68,8 +106,8 @@ case class CUDA_DeviceBufferDataUpdated(var field : IR_Field, var direction : St
   override def prettyprint(out : PpStream) : Unit = out << resolveAccess(resolveName(), fragmentIdx, IR_NullExpression, field.index, field.level, neighIdx)

   override def resolveName() = s"deviceBufferDataUpdated_$direction" + resolvePostfix(fragmentIdx.prettyprint, "", field.index.toString, field.level.toString, neighIdx.prettyprint)
-  override def resolveDatatype() = IR_BooleanDatatype
-  override def resolveDefValue() = Some(false)
+  override def resolveDatatype() = IR_IntegerDatatype
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
 }

 /// CUDA_ExecutionMode
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
index 1f5002648..bc9dcf1cb 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
@@ -221,28 +221,42 @@ case class CUDA_HandleFragmentLoops(

     for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
       val fieldData = access._2
-      val transferStream = CUDA_TransferStream(fieldData.field, Duplicate(fieldData.fragmentIdx))
+      val field = fieldData.field
+      val fragIdx = fieldData.fragmentIdx
+      val domainIdx = field.domain.index
+      val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

       // add data sync statements
       if (syncBeforeHost(access._1, fieldAccesses.keys))
         beforeHost += CUDA_UpdateHostData(Duplicate(fieldData), transferStream).expand().inner // expand here to avoid global expand afterwards

       // update flags for written fields
-      if (syncAfterHost(access._1, fieldAccesses.keys))
-        afterHost += IR_Assignment(CUDA_HostDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx)), IR_BooleanConstant(true))
+      if (syncAfterHost(access._1, fieldAccesses.keys)) {
+        val dirtyFlag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        afterHost += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+          IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+      }
     }

     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
-      val transferStream = CUDA_TransferStream(buffer.field, Duplicate(buffer.fragmentIdx))
+      val field = buffer.field
+      val fragIdx = buffer.fragmentIdx
+      val domainIdx = field.domain.index
+      val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

       // add buffer sync statements
       if (syncBeforeHost(access._1, bufferAccesses.keys))
         beforeHost += CUDA_UpdateHostBufferData(Duplicate(buffer), transferStream).expand().inner // expand here to avoid global expand afterwards

       // update flags for written buffers
-      if (syncAfterHost(access._1, bufferAccesses.keys))
-        afterHost += IR_Assignment(CUDA_HostBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(buffer.fragmentIdx)), IR_BooleanConstant(true))
+      if (syncAfterHost(access._1, bufferAccesses.keys)) {
+        val dirtyFlag = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(fragIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        afterHost += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+          IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+      }
     }

     // - device sync stmts -
@@ -250,27 +264,41 @@ case class CUDA_HandleFragmentLoops(
     if (isParallel) {
       for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
         val fieldData = access._2
-        val transferStream = CUDA_TransferStream(fieldData.field, Duplicate(fieldData.fragmentIdx))
+        val field = fieldData.field
+        val fragIdx = fieldData.fragmentIdx
+        val domainIdx = field.domain.index
+        val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

         // add data sync statements
         if (syncBeforeDevice(access._1, fieldAccesses.keys))
           beforeDevice += CUDA_UpdateDeviceData(Duplicate(fieldData), transferStream).expand().inner // expand here to avoid global expand afterwards

         // update flags for written fields
-        if (syncAfterDevice(access._1, fieldAccesses.keys))
-          afterDevice += IR_Assignment(CUDA_DeviceDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx)), IR_BooleanConstant(true))
+        if (syncAfterDevice(access._1, fieldAccesses.keys)) {
+          val dirtyFlag = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+          val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+          afterDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+        }
       }

       for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
         val buffer = access._2
-        val transferStream = CUDA_TransferStream(buffer.field, Duplicate(buffer.fragmentIdx))
+        val field = buffer.field
+        val fragIdx = buffer.fragmentIdx
+        val domainIdx = field.domain.index
+        val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

         // add data sync statements
         if (syncBeforeDevice(access._1, bufferAccesses.keys))
           beforeDevice += CUDA_UpdateDeviceBufferData(Duplicate(buffer), transferStream).expand().inner // expand here to avoid global expand afterwards

         // update flags for written fields
-        if (syncAfterDevice(access._1, bufferAccesses.keys))
-          afterDevice += IR_Assignment(CUDA_DeviceBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(buffer.fragmentIdx)), IR_BooleanConstant(true))
+        if (syncAfterDevice(access._1, bufferAccesses.keys)) {
+          val dirtyFlag = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(fragIdx))
+          val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+          afterDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+        }
       }
     }

@@ -305,8 +333,11 @@ case class CUDA_HandleFragmentLoops(
       val redTarget = Duplicate(red.target)

       // move reduction towards "synchroFragLoop"
-      // -> OpenMP/MPI reduction occurs after accumulation in "synchroFragLoop"
-      loop.parallelization.reduction = None
+      // -> MPI reduction occurs after accumulation in "synchroFragLoop" (i.e. skip in enclosing frag loop and its inner dimension loop)
+      // -> OMP reduction occurs only for parallelization over IR_LoopOverDimensions, otherwise skipped as MPI reduction
+      loop.parallelization.reduction.get.skipMpi = true
+      loop.parallelization.reduction.get.skipOpenMP = !Knowledge.omp_parallelizeLoopOverDimensions
+
       syncAfterFragLoop.parallelization.reduction = Some(red)

       // force comp stream sync if comp kernels are not synced explicitly
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
index 060ddd3c5..ef6096c5b 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
@@ -82,11 +82,10 @@ case class CUDA_UpdateHostData(var fieldData : IR_IV_FieldData, stream : CUDA_Tr
   override def expand() : Output[IR_Statement] = {
     val field = fieldData.field
     val fragIdx = fieldData.fragmentIdx
-    val isDirty = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
-    val isValid = if (fragIdx != IR_LoopOverFragments.defIt)
-      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(field.domain.index, fragIdx)
-    else
-      IR_IV_IsValidForDomain(field.domain.index, fragIdx)
+    val domainIdx = field.domain.index
+    val flag = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "device_to_host").contains(Knowledge.cuda_eliminate_memory_transfers))
       return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isValid AndAnd isDirty,
@@ -102,6 +101,7 @@ case class CUDA_UpdateHostData(var fieldData : IR_IV_FieldData, stream : CUDA_Tr
             (0 until field.layout.numDimsData).map(dim => field.layout.idxById("TOT", dim)).reduceLeft(_ * _) * IR_SizeOf(field.resolveBaseDatatype),
             "D2H", stream),
+          IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
           CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -117,11 +117,10 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
   override def expand() : Output[IR_Statement] = {
     val field = fieldData.field
     val fragIdx = fieldData.fragmentIdx
-    val isDirty = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
-    val isValid = if (fragIdx != IR_LoopOverFragments.defIt)
-      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(field.domain.index, fragIdx)
-    else
-      IR_IV_IsValidForDomain(field.domain.index, fragIdx)
+    val domainIdx = field.domain.index
+    val flag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "host_to_device").contains(Knowledge.cuda_eliminate_memory_transfers))
       return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isValid AndAnd isDirty,
@@ -137,6 +136,7 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
             (0 until field.layout.numDimsData).map(dim => field.layout.idxById("TOT", dim)).reduceLeft(_ * _) * IR_SizeOf(field.resolveBaseDatatype),
             "H2D", stream),
+          IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
           CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -146,15 +146,19 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
 case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUDA_TransferStream) extends CUDA_HostStatement with IR_Expandable {
   override def expand() : Output[IR_Statement] = {
     val field = buffer.field
-    val isDirty = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val fragIdx = buffer.fragmentIdx
+    val domainIdx = field.domain.index
+    val flag = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "device_to_host").contains(Knowledge.cuda_eliminate_memory_transfers))
-      return IR_IfCondition(IR_AndAnd(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()), isDirty),
+      return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isDirty AndAnd isValid,
         ListBuffer[IR_Statement](
           CUDA_DeviceSynchronize(),
           IR_Assignment(CUDA_IssuedSyncForEliminatedTransfer(), true)))

-    IR_IfCondition(isDirty,
+    IR_IfCondition(isValid AndAnd isDirty,
       ListBuffer[IR_Statement](
         CUDA_TransferUtil.genTransfer(
           Duplicate(buffer),
@@ -162,7 +166,8 @@ case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUD
           Duplicate(buffer.size) * IR_SizeOf(field.resolveBaseDatatype),
           "D2H", stream),
-      CUDA_EventRecord(CUDA_PendingStreamTransfers(field, buffer.fragmentIdx), stream)))
+        IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
+        CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -171,15 +176,19 @@ case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUD
 case class CUDA_UpdateDeviceBufferData(var buffer : IR_IV_CommBuffer, stream : CUDA_TransferStream) extends CUDA_HostStatement with IR_Expandable {
   override def expand() : Output[IR_Statement] = {
     val field = buffer.field
-    val isDirty = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val fragIdx = buffer.fragmentIdx
+    val domainIdx = field.domain.index
+    val flag = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "host_to_device").contains(Knowledge.cuda_eliminate_memory_transfers))
-      return IR_IfCondition(IR_AndAnd(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()), isDirty),
+      return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isDirty AndAnd isValid,
         ListBuffer[IR_Statement](
           CUDA_DeviceSynchronize(),
           IR_Assignment(CUDA_IssuedSyncForEliminatedTransfer(), true)))

-    IR_IfCondition(isDirty,
+    IR_IfCondition(isValid AndAnd isDirty,
       ListBuffer[IR_Statement](
         CUDA_TransferUtil.genTransfer(
           Duplicate(buffer),
@@ -187,6 +196,7 @@ case class CUDA_UpdateDeviceBufferData(var buffer : IR_IV_CommBuffer, stream : C
           Duplicate(buffer.size) * IR_SizeOf(field.resolveBaseDatatype),
           "H2D", stream),
-      CUDA_EventRecord(CUDA_PendingStreamTransfers(field, buffer.fragmentIdx), stream)))
+        IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
+        CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
index cc63a79f9..45272c0f1 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
@@ -4,6 +4,7 @@ import scala.collection.mutable
 import scala.collection.mutable.ListBuffer

 import exastencils.base.ir._
+import exastencils.base.ir.IR_ImplicitConversion._
 import exastencils.baseExt.ir._
 import exastencils.communication.ir.IR_IV_CommBuffer
 import exastencils.core.Duplicate
@@ -104,26 +105,62 @@ trait CUDA_PrepareFragmentLoops extends CUDA_PrepareBufferSync with CUDA_Executi
       val fieldData = access._2
       if (syncBeforeHost(access._1, fieldAccesses.keys)) {
         val dirtyFlag = CUDA_DeviceDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
-        beforeHost += IR_IfCondition(dirtyFlag,
+        beforeHost += IR_IfCondition(dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id,
           ListBuffer[IR_Statement](
             CUDA_WaitEvent(CUDA_PendingStreamTransfers(fieldData.field, Duplicate(fieldData.fragmentIdx)), stream, "D2H"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
       if (syncBeforeHost(access._1, bufferAccesses.keys)) {
         val dirtyFlag = CUDA_DeviceBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx))
-        beforeHost += IR_IfCondition(dirtyFlag,
+        beforeHost += IR_IfCondition(dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id,
           ListBuffer[IR_Statement](
             CUDA_WaitEvent(CUDA_PendingStreamTransfers(buffer.field, Duplicate(buffer.fragmentIdx)), stream, "D2H"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }

     beforeHost
   }

+  def syncFlagsAfterHost() = {
+    var afterHost : ListBuffer[IR_Statement] = ListBuffer()
+
+    // update flags for written fields/buffers (check for valid fragment already implicitly done in surrounding loop)
+    for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterHost(access._1, fieldAccesses.keys))
+        afterHost += IR_Assignment(CUDA_HostDataUpdated(access._2.field, Duplicate(access._2.slot)),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+    for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterHost(access._1, bufferAccesses.keys))
+        afterHost += IR_Assignment(CUDA_HostBufferDataUpdated(access._2.field, access._2.direction, Duplicate(access._2.neighIdx)),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+
+    afterHost
+  }
+
+  def syncFlagsAfterDevice() = {
+    var afterDevice : ListBuffer[IR_Statement] = ListBuffer()
+
+    // update flags for written fields/buffers (check for valid fragment already implicitly done in surrounding loop)
+    for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterDevice(access._1, fieldAccesses.keys))
+        afterDevice += IR_Assignment(CUDA_DeviceDataUpdated(access._2.field, Duplicate(access._2.slot), access._2.fragmentIdx),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+    for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterDevice(access._1, bufferAccesses.keys))
+        afterDevice += IR_Assignment(CUDA_DeviceBufferDataUpdated(access._2.field, access._2.direction, Duplicate(access._2.neighIdx), access._2.fragmentIdx),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+
+    afterDevice
+  }
+
   def syncEventsBeforeDevice(stream : CUDA_Stream) = {
     var beforeDevice : ListBuffer[IR_Statement] = ListBuffer()

@@ -131,21 +168,29 @@ trait CUDA_PrepareFragmentLoops extends CUDA_PrepareBufferSync with CUDA_Executi
     for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
       val fieldData = access._2
       if (syncBeforeDevice(access._1, fieldAccesses.keys)) {
-        val dirtyFlag = CUDA_HostDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
-        beforeDevice += IR_IfCondition(dirtyFlag,
+        val field = fieldData.field
+        val fragIdx = fieldData.fragmentIdx
+        val domainIdx = field.domain.index
+        val dirtyFlag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        beforeDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
           ListBuffer[IR_Statement](
-            CUDA_WaitEvent(CUDA_PendingStreamTransfers(fieldData.field, Duplicate(fieldData.fragmentIdx)), stream, "H2D"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            CUDA_WaitEvent(CUDA_PendingStreamTransfers(field, Duplicate(fieldData.fragmentIdx)), stream, "H2D"),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
       if (syncBeforeDevice(access._1, bufferAccesses.keys)) {
+        val field = buffer.field
+        val fragIdx = buffer.fragmentIdx
+        val domainIdx = field.domain.index
         val dirtyFlag = CUDA_HostBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx))
-        beforeDevice += IR_IfCondition(dirtyFlag,
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        beforeDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
           ListBuffer[IR_Statement](
            CUDA_WaitEvent(CUDA_PendingStreamTransfers(buffer.field, Duplicate(buffer.fragmentIdx)), stream, "H2D"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
index 3efc0bcae..4b2f5453b 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
@@ -33,6 +33,7 @@ import exastencils.logger.Logger
 import exastencils.parallelization.ir.IR_HasParallelizationInfo
 import exastencils.util.NoDuplicateWrapper
 import exastencils.util.ir._
+import exastencils.base.ir.IR_ImplicitConversion._

 /// CUDA_PrepareHostCode

@@ -75,7 +76,7 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
     bufferAccesses ++= gatherBuffers.bufferAccesses
   }

-  def getHostDeviceSyncStmts(body : ListBuffer[IR_Statement], isParallel : Boolean, executionStream: CUDA_Stream) = {
+  def getHostDeviceSyncStmts(body : ListBuffer[IR_Statement], isParallel : Boolean, executionStream : CUDA_Stream) = {
     val (beforeHost, afterHost) = (ListBuffer[IR_Statement](), ListBuffer[IR_Statement]())
     val (beforeDevice, afterDevice) = (ListBuffer[IR_Statement](), ListBuffer[IR_Statement]())

@@ -86,11 +87,15 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
     beforeHost ++= syncEventsBeforeHost(executionStream)

+    afterHost ++= syncFlagsAfterHost()
+
     // device sync stmts
     if (isParallel) {
       if (!Knowledge.experimental_cuda_useStreams && !Knowledge.cuda_omitSyncDeviceAfterKernelCalls)
         afterDevice += CUDA_DeviceSynchronize()
+
+      afterDevice ++= syncFlagsAfterDevice()
     }

     beforeDevice ++= syncEventsBeforeDevice(executionStream)
@@ -99,7 +104,7 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
   }

   // extract estimated times for host/device from performance evaluation strategy (zero if estimation doesn't exist)
-  def getTimeEstimation(loop: IR_LoopOverDimensions, host: Boolean) =
+  def getTimeEstimation(loop : IR_LoopOverDimensions, host : Boolean) =
     loop.getAnnotation(if (host) "perf_timeEstimate_host" else "perf_timeEstimate_device").getOrElse(0.0).asInstanceOf[Double]

   // use condWrapper to prevent automatic removal of branching from simplification strategies
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
index 9c97f6e75..a9d75cf82 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
@@ -175,13 +175,17 @@ object CUDA_PrepareMPICode extends DefaultStrategy("Prepare CUDA relevant code b
     beforeHost ++= syncEventsBeforeHost(executionStream)

+    afterHost ++= syncFlagsAfterHost()
+
     // device sync stmts
-    if (!Knowledge.experimental_cuda_useStreams && !Knowledge.cuda_omitSyncDeviceAfterKernelCalls)
-      afterDevice += CUDA_DeviceSynchronize()
+    if (!Knowledge.experimental_cuda_useStreams)
+      beforeDevice += CUDA_DeviceSynchronize()

     beforeDevice ++= syncEventsBeforeDevice(executionStream)

+    afterDevice ++= syncFlagsAfterDevice()
+
     (beforeHost, afterHost, beforeDevice, afterDevice)
   }
diff --git a/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala b/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
index 87f9d9e03..77c087a65 100644
--- a/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
+++ b/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
@@ -123,7 +123,7 @@ object OMP_AddParallelSections extends DefaultStrategy("Handle potentially paral
     case target : IR_ForLoop if target.parallelization.potentiallyParallel && target.parallelization.parallelizationReasonable && !target.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) =>
       val additionalOMPClauses = ListBuffer[OMP_Clause]()

-      if (target.parallelization.reduction.isDefined)
+      if (target.parallelization.reduction.isDefined && !target.parallelization.reduction.get.skipOpenMP)
         additionalOMPClauses += OMP_Reduction(target.parallelization.reduction.get)

       if (target.parallelization.privateVars.nonEmpty)
diff --git a/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala b/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
index ce3096547..feb2ecc61 100644
--- a/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
+++ b/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
@@ -21,6 +21,7 @@ package exastencils.parallelization.ir
 import scala.collection.mutable.ListBuffer

 import exastencils.base.ir._
+import exastencils.config.Knowledge

 /// IR_ParallelizationInfo
@@ -49,4 +50,22 @@ case class IR_ParallelizationInfo(

 trait IR_HasParallelizationInfo {
   var parallelization : IR_ParallelizationInfo
+
+  def parallelizationOverDimensionsIsReasonable(maxIterationCount : Array[Long]) : Boolean = if (Knowledge.omp_parallelizeLoopOverDimensions) {
+    if (maxIterationCount == null)
+      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
+
+    maxIterationCount.product > Knowledge.omp_minWorkItemsPerThread * Knowledge.omp_numThreads
+  } else {
+    false
+  }
+
+  def parallelizationOverFragmentsIsReasonable(maxIterationCount : Array[Long]) : Boolean = if (Knowledge.omp_parallelizeLoopOverFragments) {
+    if (maxIterationCount == null)
+      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
+
+    math.max(Knowledge.domain_numFragmentsPerBlock / Knowledge.omp_numThreads, 1) * maxIterationCount.product > Knowledge.omp_minWorkItemsPerThread
+  } else {
+    false
+  }
 }
diff --git a/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala b/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
index 52b0993ed..d317edd02 100644
--- a/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
+++ b/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
@@ -373,7 +373,7 @@ class IR_PolyExtractor extends Collector {
       mergeStmts : Boolean) : Unit = {

     this.scop_ = new Scop(root, localContext, globalContext, optLevel,
-      Knowledge.omp_parallelizeLoopOverDimensions && root.parallelizationIsReasonable, root.maxIterationCount())
+      Knowledge.omp_parallelizeLoopOverDimensions && root.parallelizationOverDimensionsIsReasonable(root.maxIterationCount()), root.maxIterationCount())
     if (mergeWithPrev)
       scops.last.nextMerge = this.scop_
     this.modelLoopVars_ = modelLoopVars
@@ -500,7 +500,7 @@ class IR_PolyExtractor extends Collector {
       if (loop.parallelization.reduction.isDefined)
         loop.parallelization.reduction.get.annotate(SKIP_ANNOT)
       // loop.at1stIt is a list of tuple, StateManager does not handle these, so a skip annotation is not required
-      if (loop.parallelizationIsReasonable && loop.polyOptLevel >= 1)
+      if (loop.parallelizationOverDimensionsIsReasonable(loop.maxIterationCount()) && loop.polyOptLevel >= 1)
         enterLoop(loop, merge)

     case _ =>
diff --git a/Testing/Misc/reduction.exa4 b/Testing/Misc/reduction.exa4
index da83f5661..cf79bbb71 100644
--- a/Testing/Misc/reduction.exa4
+++ b/Testing/Misc/reduction.exa4
@@ -24,7 +24,6 @@ Function Application {
   }

   Var redTarget : Real = 0.0
-
   loop over mat@finest with reduction (+: redTarget) {
     redTarget += mat@finest[2][2]
   }
@@ -40,32 +39,32 @@ Function Application {
   Var redTarget2 : Matrix

   loop over mat@finest with reduction (+: redTarget2[0][0]) {
-    redTarget2[0][0] = matrixEntry
+    redTarget2[0][0] += matrixEntry
   }

-  if ( fabs(redTarget2[0][0] - numReds * matrixEntry ) <= eps) {
+  if ( fabs(redTarget2[0][0] - expected ) <= eps) {
     print('Passed stage 1: Reduced matrix entry equals the number of reductions times the original value of the entry')
   } else {
-    print('Failed stage 1: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', numReds * matrixEntry)
+    print('Failed stage 1: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', expected)
   }

   loop over fragments with reduction (max: redTarget2[0][0]) {
     if (fragmentIdx == 0) {
-      redTarget2[0][0] = maxVal
+      redTarget2[0][0] = max(maxVal, redTarget2[0][0])
     }
   }

   if ( fabs(redTarget2[0][0] - maxVal) <= eps ) {
-    print('Passed stage 2: redTarget2[0][0] equals maxVal')
+    print('Passed stage 2: Reduced matrix entry redTarget2[0][0] equals maxVal')
   } else {
-    print('Failed stage 2: redTarget=', redTarget2[0][0], ', maxVal=', maxVal)
+    print('Failed stage 2: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', maxVal)
   }

   Var redTarget3 : Matrix = 0.

   loop over fragments with reduction (+: redTarget3) {
-    redTarget3[0] = maxVal
-    redTarget3[1] = minVal
+    redTarget3[0] += maxVal
+    redTarget3[1] += minVal
   }

   if ( fabs(redTarget3[0][0] - numReds * maxVal) <= eps &&
@@ -74,30 +73,30 @@ Function Application {
       fabs(redTarget3[3][0]) <= eps &&
       fabs(redTarget3[4][0]) <= eps ) {

-    print('Passed stage 3')
+    print('Passed stage 3: Reduced matrix matches expected target matrix')
   } else {
     print('Failed stage 3: redTarget3[0][0]=', redTarget3[0][0],
       ', redTarget3[1][0]=', redTarget3[1][0],
       ', redTarget3[2][0]=', redTarget3[2][0],
       ', redTarget3[3][0]=', redTarget3[3][0],
-      ', redTarget3[4][0]=', redTarget3[4][0])
+      ', redTarget3[4][0]=', redTarget3[4][0],
+      '. Expected = {', numReds * maxVal, ',', numReds * minVal, ', 0., 0., 0.}')
   }

-  Var redTarget4 : Matrix = {{maxVal}, {minVal}}
-  Expr numReds2 = getKnowledge ( 'domain_rect_numBlocks_x' ) // no reduction taking place on OpenMP site
+  Var redTarget4 : Matrix = {{matrixEntry}, {matrixEntry}}

-  loop over fragments with reduction (+: redTarget4) {
-    if (sqrt(4) != 2) {
-      print('dummy line')
-    }
+  loop over fragments with reduction (min: redTarget4) {
+    redTarget4[0][0] = min(redTarget4[0][0], minVal)
+    redTarget4[1][0] = min(redTarget4[1][0], maxVal)
   }

-  if ( fabs(redTarget4[0][0] - numReds2 * maxVal) <= eps &&
-       fabs(redTarget4[1][0] - numReds2 * minVal) <= eps) {
+  if ( fabs(redTarget4[0][0] - minVal) <= eps &&
+       fabs(redTarget4[1][0] - matrixEntry) <= eps) {

-    print('Passed stage 4')
+    print('Passed stage 4: Reduced matrix matches expected target matrix')
   } else {
-    print('Failed stage 4: redTarget4[0][0]=', redTarget4[0][0], ', redTarget4[1][0]=', redTarget4[1][0])
+    print('Failed stage 4: redTarget4[0][0]=', redTarget4[0][0], ', redTarget4[1][0]=', redTarget4[1][0],
+      '. Expected: {', minVal, ',', matrixEntry, '}')
   }

   destroyGlobals ( )
diff --git a/Testing/Misc/reduction.results b/Testing/Misc/reduction.results
index 9fb91e614..27d42aa6c 100644
--- a/Testing/Misc/reduction.results
+++ b/Testing/Misc/reduction.results
@@ -1,5 +1,5 @@
 Passed stage 0: redTarget equals expected result
 Passed stage 1: Reduced matrix entry equals the number of reductions times the original value of the entry
-Passed stage 2: redTarget2[0][0] equals maxVal
-Passed stage 3
-Passed stage 4
\ No newline at end of file
+Passed stage 2: Reduced matrix entry redTarget2[0][0] equals maxVal
+Passed stage 3: Reduced matrix matches expected target matrix
+Passed stage 4: Reduced matrix matches expected target matrix
\ No newline at end of file