diff --git a/Compiler/src/exastencils/base/ir/IR_Reduction.scala b/Compiler/src/exastencils/base/ir/IR_Reduction.scala
index ea6e67008..69c63ded9 100644
--- a/Compiler/src/exastencils/base/ir/IR_Reduction.scala
+++ b/Compiler/src/exastencils/base/ir/IR_Reduction.scala
@@ -21,4 +21,5 @@ package exastencils.base.ir
 /// IR_Reduction
 // FIXME: op as BinOp
-case class IR_Reduction(var op : String, var target : IR_Expression, var targetName : String, var skipMpi : Boolean = false) extends IR_Node
+case class IR_Reduction(var op : String, var target : IR_Expression, var targetName : String,
+  var skipMpi : Boolean = false, var skipOpenMP : Boolean = false) extends IR_Node
diff --git a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
index 1bf6669d9..e4d59609f 100644
--- a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
+++ b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverDimensions.scala
@@ -28,6 +28,7 @@ import exastencils.datastructures._
 import exastencils.logger.Logger
 import exastencils.optimization.ir._
 import exastencils.parallelization.ir._
+import exastencils.util.ir.IR_FragmentLoopCollector

 // FIXME: refactor
 object IR_LoopOverDimensions {
@@ -148,20 +149,8 @@ case class IR_LoopOverDimensions(
     }
   }

-  def parallelizationIsReasonable : Boolean = {
-    val maxItCount = maxIterationCount()
-    if (maxItCount == null)
-      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
-
-    var totalNumPoints : Long = 1
-    for (i <- maxItCount)
-      totalNumPoints *= i
-    totalNumPoints > Knowledge.omp_minWorkItemsPerThread * Knowledge.omp_numThreads
-  }
-
   def explParLoop = lcCSEApplied && parallelization.potentiallyParallel &&
-    Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverDimensions &&
-    parallelizationIsReasonable
+    Knowledge.omp_enabled && parallelizationOverDimensionsIsReasonable(maxIterationCount())

   def createOMPThreadsWrapper(body : ListBuffer[IR_Statement]) : ListBuffer[IR_Statement] = {
     if (explParLoop) {
@@ -214,9 +203,9 @@ case class IR_LoopOverDimensions(
     nju
   }

-  def expandSpecial() : ListBuffer[IR_Statement] = {
+  def expandSpecial(collector : IR_FragmentLoopCollector) : ListBuffer[IR_Statement] = {
     def parallelizable(d : Int) = parallelization.potentiallyParallel && parDims.contains(d)
-    def parallelize(d : Int) = parallelizable(d) && Knowledge.omp_parallelizeLoopOverDimensions && parallelizationIsReasonable
+    def parallelize(d : Int) = parallelizable(d) && parallelizationOverDimensionsIsReasonable(maxIterationCount())

     // TODO: check interaction between at1stIt and condition (see also: TODO in polyhedron.Extractor.enterLoop)
     var wrappedBody : ListBuffer[IR_Statement] = body
@@ -250,6 +239,16 @@ case class IR_LoopOverDimensions(
       wrappedBody = ListBuffer[IR_Statement](loop)
     }

+    // propagate parallelization hints to enclosing fragment loop if parallel
+    if (Knowledge.omp_parallelizeLoopOverFragments && collector.getEnclosingFragmentLoop().isDefined) {
+      collector.getEnclosingFragmentLoop().get match {
+        case fragLoop : IR_LoopOverFragments =>
+          fragLoop.parallelization.parallelizationReasonable &&= parallelizationOverFragmentsIsReasonable(maxIterationCount())
+        case fragLoop @ IR_ForLoop(IR_VariableDeclaration(_, name, _, _), _, _, _, _) if name == IR_LoopOverFragments.defIt.name =>
+          fragLoop.parallelization.parallelizationReasonable &&= parallelizationOverFragmentsIsReasonable(maxIterationCount())
+      }
+    }
+
     wrappedBody = createOMPThreadsWrapper(wrappedBody)

     wrappedBody
@@ -259,7 +258,11 @@ case class IR_LoopOverDimensions(
 /// IR_ResolveLoopOverDimensions

 object IR_ResolveLoopOverDimensions extends DefaultStrategy("Resolve LoopOverDimensions nodes") {
+  var collector = new IR_FragmentLoopCollector
+  this.register(collector)
+  this.onBefore = () => this.resetCollectors()
+
   this += new Transformation("Resolve", {
-    case loop : IR_LoopOverDimensions => loop.expandSpecial()
+    case loop : IR_LoopOverDimensions => loop.expandSpecial(collector)
   })
 }
diff --git a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
index 764dbd761..cbe7c208d 100644
--- a/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
+++ b/Compiler/src/exastencils/baseExt/ir/IR_LoopOverFragments.scala
@@ -45,6 +45,10 @@ case class IR_LoopOverFragments(
     // TODO: separate omp and potentiallyParallel
     parallelization.potentiallyParallel = Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverFragments && parallelization.potentiallyParallel

+    // if there is no loop found, we determine here if omp parallelization is reasonable
+    if (Knowledge.omp_enabled && Knowledge.omp_parallelizeLoopOverFragments & !body.exists(_.isInstanceOf[IR_HasParallelizationInfo]))
+      parallelization.potentiallyParallel &&= parallelizationOverFragmentsIsReasonable(Array(Knowledge.domain_numFragmentsPerBlock))
+
     val loop = IR_ForLoop(
       IR_VariableDeclaration(defIt, 0),
       IR_Lower(defIt, Knowledge.domain_numFragmentsPerBlock),
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
index 55f73bb69..616dd5bd4 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_Flags.scala
@@ -23,9 +23,33 @@ import exastencils.base.ir._
 import exastencils.baseExt.ir._
 import exastencils.communication.ir._
 import exastencils.config._
+import exastencils.domain.ir.IR_IV_IsValidForDomain
 import exastencils.field.ir._
 import exastencils.prettyprinting._

+/// CUDA_DirtyFlagHelper
+
+object CUDA_DirtyFlagHelper {
+  def fragmentIdxIsValid(fragIdx : IR_Expression, domainIdx : IR_Expression) = {
+    if (fragIdx != IR_LoopOverFragments.defIt)
+      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(domainIdx, fragIdx)
+    else
+      IR_IV_IsValidForDomain(domainIdx, fragIdx)
+  }
+}
+
+/// CUDA_DirtyFlagCase
+
+object CUDA_DirtyFlagCase extends Enumeration {
+  type Access = Value
+  final val ANNOT : String = "DirtyFlagCase"
+
+  // CLEAR       : field/buffer was not updated -> no transfer needed
+  // INTERMEDIATE: field/buffer was updated -> possibly need to wait for event before setting to DIRTY
+  // DIRTY       : field/buffer was updated -> transfer needed if execution hardware changes
+  final val CLEAR, INTERMEDIATE, DIRTY = Value
+}
+
 /// CUDA_HostDataUpdated

 // TODO: move to communication package?
@@ -35,7 +59,14 @@ case class CUDA_HostDataUpdated(override var field : IR_Field, override var slot
   override def usesFieldArrays : Boolean = !Knowledge.data_useFieldNamesAsIdx

   override def resolveName() = s"hostDataUpdated" + resolvePostfix(fragmentIdx.prettyprint, "", if (Knowledge.data_useFieldNamesAsIdx) field.name else field.index.toString, field.level.toString, "")
-  override def resolveDefValue() = Some(true)
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.DIRTY.id)
+
+  override def resolveDatatype() = {
+    if (field.numSlots > 1)
+      IR_ArrayDatatype(IR_IntegerDatatype, field.numSlots)
+    else
+      IR_IntegerDatatype
+  }
 }

 /// CUDA_DeviceDataUpdated
@@ -47,7 +78,14 @@ case class CUDA_DeviceDataUpdated(override var field : IR_Field, override var sl
   override def usesFieldArrays : Boolean = !Knowledge.data_useFieldNamesAsIdx

   override def resolveName() = s"deviceDataUpdated" + resolvePostfix(fragmentIdx.prettyprint, "", if (Knowledge.data_useFieldNamesAsIdx) field.name else field.index.toString, field.level.toString, "")
-  override def resolveDefValue() = Some(false)
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
+
+  override def resolveDatatype() = {
+    if (field.numSlots > 1)
+      IR_ArrayDatatype(IR_IntegerDatatype, field.numSlots)
+    else
+      IR_IntegerDatatype
+  }
 }

 /// CUDA_HostBufferDataUpdated
@@ -57,8 +95,8 @@ case class CUDA_HostBufferDataUpdated(var field : IR_Field, var direction : Stri
   override def prettyprint(out : PpStream) : Unit = out << resolveAccess(resolveName(), fragmentIdx, IR_NullExpression, field.index, field.level, neighIdx)

   override def resolveName() = s"hostBufferDataUpdated_$direction" + resolvePostfix(fragmentIdx.prettyprint, "", field.index.toString, field.level.toString, neighIdx.prettyprint)
-  override def resolveDatatype() = IR_BooleanDatatype
-  override def resolveDefValue() = Some(false)
+  override def resolveDatatype() = IR_IntegerDatatype
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
 }

 /// CUDA_DeviceBufferDataUpdated
@@ -68,8 +106,8 @@ case class CUDA_DeviceBufferDataUpdated(var field : IR_Field, var direction : St
   override def prettyprint(out : PpStream) : Unit = out << resolveAccess(resolveName(), fragmentIdx, IR_NullExpression, field.index, field.level, neighIdx)

   override def resolveName() = s"deviceBufferDataUpdated_$direction" + resolvePostfix(fragmentIdx.prettyprint, "", field.index.toString, field.level.toString, neighIdx.prettyprint)
-  override def resolveDatatype() = IR_BooleanDatatype
-  override def resolveDefValue() = Some(false)
+  override def resolveDatatype() = IR_IntegerDatatype
+  override def resolveDefValue() = Some(CUDA_DirtyFlagCase.CLEAR.id)
 }

 /// CUDA_ExecutionMode
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
index 1f5002648..bc9dcf1cb 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_HandleFragmentLoops.scala
@@ -221,28 +221,42 @@ case class CUDA_HandleFragmentLoops(

     for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
       val fieldData = access._2
-      val transferStream = CUDA_TransferStream(fieldData.field, Duplicate(fieldData.fragmentIdx))
+      val field = fieldData.field
+      val fragIdx = fieldData.fragmentIdx
+      val domainIdx = field.domain.index
+      val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

       // add data sync statements
       if (syncBeforeHost(access._1, fieldAccesses.keys))
         beforeHost += CUDA_UpdateHostData(Duplicate(fieldData), transferStream).expand().inner // expand here to avoid global expand afterwards

       // update flags for written fields
-      if (syncAfterHost(access._1, fieldAccesses.keys))
-        afterHost += IR_Assignment(CUDA_HostDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx)), IR_BooleanConstant(true))
+      if (syncAfterHost(access._1, fieldAccesses.keys)) {
+        val dirtyFlag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        afterHost += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+          IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+      }
     }

     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
-      val transferStream = CUDA_TransferStream(buffer.field, Duplicate(buffer.fragmentIdx))
+      val field = buffer.field
+      val fragIdx = buffer.fragmentIdx
+      val domainIdx = field.domain.index
+      val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

       // add buffer sync statements
       if (syncBeforeHost(access._1, bufferAccesses.keys))
         beforeHost += CUDA_UpdateHostBufferData(Duplicate(buffer), transferStream).expand().inner // expand here to avoid global expand afterwards

       // update flags for written buffers
-      if (syncAfterHost(access._1, bufferAccesses.keys))
-        afterHost += IR_Assignment(CUDA_HostBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(buffer.fragmentIdx)), IR_BooleanConstant(true))
+      if (syncAfterHost(access._1, bufferAccesses.keys)) {
+        val dirtyFlag = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(fragIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        afterHost += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+          IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+      }
     }

     // - device sync stmts -
@@ -250,27 +264,41 @@ case class CUDA_HandleFragmentLoops(
     if (isParallel) {
       for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
         val fieldData = access._2
-        val transferStream = CUDA_TransferStream(fieldData.field, Duplicate(fieldData.fragmentIdx))
+        val field = fieldData.field
+        val fragIdx = fieldData.fragmentIdx
+        val domainIdx = field.domain.index
+        val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

         // add data sync statements
         if (syncBeforeDevice(access._1, fieldAccesses.keys))
           beforeDevice += CUDA_UpdateDeviceData(Duplicate(fieldData), transferStream).expand().inner // expand here to avoid global expand afterwards

         // update flags for written fields
-        if (syncAfterDevice(access._1, fieldAccesses.keys))
-          afterDevice += IR_Assignment(CUDA_DeviceDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx)), IR_BooleanConstant(true))
+        if (syncAfterDevice(access._1, fieldAccesses.keys)) {
+          val dirtyFlag = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+          val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+          afterDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+        }
       }

       for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
         val buffer = access._2
-        val transferStream = CUDA_TransferStream(buffer.field, Duplicate(buffer.fragmentIdx))
+        val field = buffer.field
+        val fragIdx = buffer.fragmentIdx
+        val domainIdx = field.domain.index
+        val transferStream = CUDA_TransferStream(field, Duplicate(fragIdx))

         // add data sync statements
         if (syncBeforeDevice(access._1, bufferAccesses.keys))
           beforeDevice += CUDA_UpdateDeviceBufferData(Duplicate(buffer), transferStream).expand().inner // expand here to avoid global expand afterwards

         // update flags for written fields
-        if (syncAfterDevice(access._1, bufferAccesses.keys))
-          afterDevice += IR_Assignment(CUDA_DeviceBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(buffer.fragmentIdx)), IR_BooleanConstant(true))
+        if (syncAfterDevice(access._1, bufferAccesses.keys)) {
+          val dirtyFlag = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx), Duplicate(fragIdx))
+          val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+          afterDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.DIRTY.id))
+        }
       }
     }

@@ -305,8 +333,11 @@ case class CUDA_HandleFragmentLoops(
       val redTarget = Duplicate(red.target)

       // move reduction towards "synchroFragLoop"
-      // -> OpenMP/MPI reduction occurs after accumulation in "synchroFragLoop"
-      loop.parallelization.reduction = None
+      // -> MPI reduction occurs after accumulation in "synchroFragLoop" (i.e. skip in enclosing frag loop and its inner dimension loop)
+      // -> OMP reduction occurs only for parallelization over IR_LoopOverDimensions, otherwise skipped as MPI reduction
+      loop.parallelization.reduction.get.skipMpi = true
+      loop.parallelization.reduction.get.skipOpenMP = !Knowledge.omp_parallelizeLoopOverDimensions
+
       syncAfterFragLoop.parallelization.reduction = Some(red)

       // force comp stream sync if comp kernels are not synced explicitly
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
index 060ddd3c5..ef6096c5b 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_MemoryTransfer.scala
@@ -82,11 +82,10 @@ case class CUDA_UpdateHostData(var fieldData : IR_IV_FieldData, stream : CUDA_Tr
   override def expand() : Output[IR_Statement] = {
     val field = fieldData.field
     val fragIdx = fieldData.fragmentIdx
-    val isDirty = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
-    val isValid = if (fragIdx != IR_LoopOverFragments.defIt)
-      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(field.domain.index, fragIdx)
-    else
-      IR_IV_IsValidForDomain(field.domain.index, fragIdx)
+    val domainIdx = field.domain.index
+    val flag = CUDA_DeviceDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "device_to_host").contains(Knowledge.cuda_eliminate_memory_transfers))
       return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isValid AndAnd isDirty,
@@ -102,6 +101,7 @@ case class CUDA_UpdateHostData(var fieldData : IR_IV_FieldData, stream : CUDA_Tr
             (0 until field.layout.numDimsData).map(dim => field.layout.idxById("TOT", dim)).reduceLeft(_ * _) * IR_SizeOf(field.resolveBaseDatatype),
             "D2H", stream),
+          IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
           CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -117,11 +117,10 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
   override def expand() : Output[IR_Statement] = {
     val field = fieldData.field
     val fragIdx = fieldData.fragmentIdx
-    val isDirty = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
-    val isValid = if (fragIdx != IR_LoopOverFragments.defIt)
-      fragIdx >= 0 AndAnd fragIdx < Knowledge.domain_numFragmentsPerBlock AndAnd IR_IV_IsValidForDomain(field.domain.index, fragIdx)
-    else
-      IR_IV_IsValidForDomain(field.domain.index, fragIdx)
+    val domainIdx = field.domain.index
+    val flag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fragIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "host_to_device").contains(Knowledge.cuda_eliminate_memory_transfers))
       return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isValid AndAnd isDirty,
@@ -137,6 +136,7 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
             (0 until field.layout.numDimsData).map(dim => field.layout.idxById("TOT", dim)).reduceLeft(_ * _) * IR_SizeOf(field.resolveBaseDatatype),
             "H2D", stream),
+          IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
           CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -146,15 +146,19 @@ case class CUDA_UpdateDeviceData(var fieldData : IR_IV_FieldData, stream : CUDA_
 case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUDA_TransferStream) extends CUDA_HostStatement with IR_Expandable {
   override def expand() : Output[IR_Statement] = {
     val field = buffer.field
-    val isDirty = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val fragIdx = buffer.fragmentIdx
+    val domainIdx = field.domain.index
+    val flag = CUDA_DeviceBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "device_to_host").contains(Knowledge.cuda_eliminate_memory_transfers))
-      return IR_IfCondition(IR_AndAnd(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()), isDirty),
+      return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isDirty AndAnd isValid,
         ListBuffer[IR_Statement](
           CUDA_DeviceSynchronize(),
           IR_Assignment(CUDA_IssuedSyncForEliminatedTransfer(), true)))

-    IR_IfCondition(isDirty,
+    IR_IfCondition(isValid AndAnd isDirty,
       ListBuffer[IR_Statement](
         CUDA_TransferUtil.genTransfer(
           Duplicate(buffer),
@@ -162,7 +166,8 @@ case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUD
           Duplicate(buffer.size) * IR_SizeOf(field.resolveBaseDatatype),
           "D2H", stream),
-      CUDA_EventRecord(CUDA_PendingStreamTransfers(field, buffer.fragmentIdx), stream)))
+        IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
+        CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
@@ -171,15 +176,19 @@ case class CUDA_UpdateHostBufferData(var buffer : IR_IV_CommBuffer, stream : CUD
 case class CUDA_UpdateDeviceBufferData(var buffer : IR_IV_CommBuffer, stream : CUDA_TransferStream) extends CUDA_HostStatement with IR_Expandable {
   override def expand() : Output[IR_Statement] = {
     val field = buffer.field
-    val isDirty = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val fragIdx = buffer.fragmentIdx
+    val domainIdx = field.domain.index
+    val flag = CUDA_HostBufferDataUpdated(field, buffer.direction, Duplicate(buffer.neighIdx))
+    val isDirty = flag EqEq CUDA_DirtyFlagCase.DIRTY.id
+    val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)

     if (Knowledge.cuda_useZeroCopy || List("both", "host_to_device").contains(Knowledge.cuda_eliminate_memory_transfers))
-      return IR_IfCondition(IR_AndAnd(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()), isDirty),
+      return IR_IfCondition(IR_Negation(CUDA_IssuedSyncForEliminatedTransfer()) AndAnd isDirty AndAnd isValid,
         ListBuffer[IR_Statement](
           CUDA_DeviceSynchronize(),
           IR_Assignment(CUDA_IssuedSyncForEliminatedTransfer(), true)))

-    IR_IfCondition(isDirty,
+    IR_IfCondition(isValid AndAnd isDirty,
       ListBuffer[IR_Statement](
         CUDA_TransferUtil.genTransfer(
           Duplicate(buffer),
@@ -187,6 +196,7 @@ case class CUDA_UpdateDeviceBufferData(var buffer : IR_IV_CommBuffer, stream : C
           Duplicate(buffer.size) * IR_SizeOf(field.resolveBaseDatatype),
           "H2D", stream),
-      CUDA_EventRecord(CUDA_PendingStreamTransfers(field, buffer.fragmentIdx), stream)))
+        IR_Assignment(flag, CUDA_DirtyFlagCase.INTERMEDIATE.id),
+        CUDA_EventRecord(CUDA_PendingStreamTransfers(field, fragIdx), stream)))
   }
 }
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
index cc63a79f9..45272c0f1 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareFragmentLoops.scala
@@ -4,6 +4,7 @@ import scala.collection.mutable
 import scala.collection.mutable.ListBuffer

 import exastencils.base.ir._
+import exastencils.base.ir.IR_ImplicitConversion._
 import exastencils.baseExt.ir._
 import exastencils.communication.ir.IR_IV_CommBuffer
 import exastencils.core.Duplicate
@@ -104,26 +105,62 @@ trait CUDA_PrepareFragmentLoops extends CUDA_PrepareBufferSync with CUDA_Executi
       val fieldData = access._2
       if (syncBeforeHost(access._1, fieldAccesses.keys)) {
         val dirtyFlag = CUDA_DeviceDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
-        beforeHost += IR_IfCondition(dirtyFlag,
+        beforeHost += IR_IfCondition(dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id,
           ListBuffer[IR_Statement](
             CUDA_WaitEvent(CUDA_PendingStreamTransfers(fieldData.field, Duplicate(fieldData.fragmentIdx)), stream, "D2H"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
       if (syncBeforeHost(access._1, bufferAccesses.keys)) {
         val dirtyFlag = CUDA_DeviceBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx))
-        beforeHost += IR_IfCondition(dirtyFlag,
+        beforeHost += IR_IfCondition(dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id,
           ListBuffer[IR_Statement](
             CUDA_WaitEvent(CUDA_PendingStreamTransfers(buffer.field, Duplicate(buffer.fragmentIdx)), stream, "D2H"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }

     beforeHost
   }

+  def syncFlagsAfterHost() = {
+    var afterHost : ListBuffer[IR_Statement] = ListBuffer()
+
+    // update flags for written fields/buffers (check for valid fragment already implicitly done in surrounding loop)
+    for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterHost(access._1, fieldAccesses.keys))
+        afterHost += IR_Assignment(CUDA_HostDataUpdated(access._2.field, Duplicate(access._2.slot)),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+    for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterHost(access._1, bufferAccesses.keys))
+        afterHost += IR_Assignment(CUDA_HostBufferDataUpdated(access._2.field, access._2.direction, Duplicate(access._2.neighIdx)),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+
+    afterHost
+  }
+
+  def syncFlagsAfterDevice() = {
+    var afterDevice : ListBuffer[IR_Statement] = ListBuffer()
+
+    // update flags for written fields/buffers (check for valid fragment already implicitly done in surrounding loop)
+    for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterDevice(access._1, fieldAccesses.keys))
+        afterDevice += IR_Assignment(CUDA_DeviceDataUpdated(access._2.field, Duplicate(access._2.slot), access._2.fragmentIdx),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+    for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
+      if (syncAfterDevice(access._1, bufferAccesses.keys))
+        afterDevice += IR_Assignment(CUDA_DeviceBufferDataUpdated(access._2.field, access._2.direction, Duplicate(access._2.neighIdx), access._2.fragmentIdx),
+          CUDA_DirtyFlagCase.INTERMEDIATE.id)
+    }
+
+    afterDevice
+  }
+
   def syncEventsBeforeDevice(stream : CUDA_Stream) = {
     var beforeDevice : ListBuffer[IR_Statement] = ListBuffer()

@@ -131,21 +168,29 @@ trait CUDA_PrepareFragmentLoops extends CUDA_PrepareBufferSync with CUDA_Executi
     for (access <- fieldAccesses.toSeq.sortBy(_._1)) {
       val fieldData = access._2
       if (syncBeforeDevice(access._1, fieldAccesses.keys)) {
-        val dirtyFlag = CUDA_HostDataUpdated(fieldData.field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
-        beforeDevice += IR_IfCondition(dirtyFlag,
+        val field = fieldData.field
+        val fragIdx = fieldData.fragmentIdx
+        val domainIdx = field.domain.index
+        val dirtyFlag = CUDA_HostDataUpdated(field, Duplicate(fieldData.slot), Duplicate(fieldData.fragmentIdx))
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        beforeDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
           ListBuffer[IR_Statement](
-            CUDA_WaitEvent(CUDA_PendingStreamTransfers(fieldData.field, Duplicate(fieldData.fragmentIdx)), stream, "H2D"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            CUDA_WaitEvent(CUDA_PendingStreamTransfers(field, Duplicate(fieldData.fragmentIdx)), stream, "H2D"),
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
     for (access <- bufferAccesses.toSeq.sortBy(_._1)) {
       val buffer = access._2
       if (syncBeforeDevice(access._1, bufferAccesses.keys)) {
+        val field = buffer.field
+        val fragIdx = buffer.fragmentIdx
+        val domainIdx = field.domain.index
         val dirtyFlag = CUDA_HostBufferDataUpdated(buffer.field, buffer.direction, Duplicate(buffer.neighIdx))
-        beforeDevice += IR_IfCondition(dirtyFlag,
+        val isValid = CUDA_DirtyFlagHelper.fragmentIdxIsValid(fragIdx, domainIdx)
+        beforeDevice += IR_IfCondition(isValid AndAnd (dirtyFlag EqEq CUDA_DirtyFlagCase.INTERMEDIATE.id),
           ListBuffer[IR_Statement](
            CUDA_WaitEvent(CUDA_PendingStreamTransfers(buffer.field, Duplicate(buffer.fragmentIdx)), stream, "H2D"),
-            IR_Assignment(dirtyFlag, IR_BooleanConstant(false))))
+            IR_Assignment(dirtyFlag, CUDA_DirtyFlagCase.CLEAR.id)))
       }
     }
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
index 3efc0bcae..4b2f5453b 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareHostCode.scala
@@ -33,6 +33,7 @@ import exastencils.logger.Logger
 import exastencils.parallelization.ir.IR_HasParallelizationInfo
 import exastencils.util.NoDuplicateWrapper
 import exastencils.util.ir._
+import exastencils.base.ir.IR_ImplicitConversion._

 /// CUDA_PrepareHostCode

@@ -75,7 +76,7 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
     bufferAccesses ++= gatherBuffers.bufferAccesses
   }

-  def getHostDeviceSyncStmts(body : ListBuffer[IR_Statement], isParallel : Boolean, executionStream: CUDA_Stream) = {
+  def getHostDeviceSyncStmts(body : ListBuffer[IR_Statement], isParallel : Boolean, executionStream : CUDA_Stream) = {
     val (beforeHost, afterHost) = (ListBuffer[IR_Statement](), ListBuffer[IR_Statement]())
     val (beforeDevice, afterDevice) = (ListBuffer[IR_Statement](), ListBuffer[IR_Statement]())

@@ -86,11 +87,15 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
     beforeHost ++= syncEventsBeforeHost(executionStream)

+    afterHost ++= syncFlagsAfterHost()
+
     // device sync stmts
     if (isParallel) {
       if (!Knowledge.experimental_cuda_useStreams && !Knowledge.cuda_omitSyncDeviceAfterKernelCalls)
         afterDevice += CUDA_DeviceSynchronize()
+
+      afterDevice ++= syncFlagsAfterDevice()
     }

     beforeDevice ++= syncEventsBeforeDevice(executionStream)
@@ -99,7 +104,7 @@ object CUDA_PrepareHostCode extends DefaultStrategy("Prepare CUDA relevant code
   }

   // extract estimated times for host/device from performance evaluation strategy (zero if estimation doesn't exist)
-  def getTimeEstimation(loop: IR_LoopOverDimensions, host: Boolean) =
+  def getTimeEstimation(loop : IR_LoopOverDimensions, host : Boolean) =
     loop.getAnnotation(if (host) "perf_timeEstimate_host" else "perf_timeEstimate_device").getOrElse(0.0).asInstanceOf[Double]

   // use condWrapper to prevent automatic removal of branching from simplification strategies
diff --git a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
index 9c97f6e75..a9d75cf82 100644
--- a/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
+++ b/Compiler/src/exastencils/parallelization/api/cuda/CUDA_PrepareMPICode.scala
@@ -175,13 +175,17 @@ object CUDA_PrepareMPICode extends DefaultStrategy("Prepare CUDA relevant code b
     beforeHost ++= syncEventsBeforeHost(executionStream)

+    afterHost ++= syncFlagsAfterHost()
+
     // device sync stmts
-    if (!Knowledge.experimental_cuda_useStreams && !Knowledge.cuda_omitSyncDeviceAfterKernelCalls)
-      afterDevice += CUDA_DeviceSynchronize()
+    if (!Knowledge.experimental_cuda_useStreams)
+      beforeDevice += CUDA_DeviceSynchronize()

     beforeDevice ++= syncEventsBeforeDevice(executionStream)

+    afterDevice ++= syncFlagsAfterDevice()
+
     (beforeHost, afterHost, beforeDevice, afterDevice)
   }
diff --git a/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala b/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
index 87f9d9e03..77c087a65 100644
--- a/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
+++ b/Compiler/src/exastencils/parallelization/api/omp/OMP_Loop.scala
@@ -123,7 +123,7 @@ object OMP_AddParallelSections extends DefaultStrategy("Handle potentially paral
     case target : IR_ForLoop if target.parallelization.potentiallyParallel && target.parallelization.parallelizationReasonable && !target.hasAnnotation(CUDA_Util.CUDA_LOOP_ANNOTATION) =>
       val additionalOMPClauses = ListBuffer[OMP_Clause]()

-      if (target.parallelization.reduction.isDefined)
+      if (target.parallelization.reduction.isDefined && !target.parallelization.reduction.get.skipOpenMP)
         additionalOMPClauses += OMP_Reduction(target.parallelization.reduction.get)

       if (target.parallelization.privateVars.nonEmpty)
diff --git a/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala b/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
index ce3096547..feb2ecc61 100644
--- a/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
+++ b/Compiler/src/exastencils/parallelization/ir/IR_ParallelizationInfo.scala
@@ -21,6 +21,7 @@ package exastencils.parallelization.ir
 import scala.collection.mutable.ListBuffer

 import exastencils.base.ir._
+import exastencils.config.Knowledge

 /// IR_ParallelizationInfo
@@ -49,4 +50,22 @@ case class IR_ParallelizationInfo(

 trait IR_HasParallelizationInfo {
   var parallelization : IR_ParallelizationInfo
+
+  def parallelizationOverDimensionsIsReasonable(maxIterationCount : Array[Long]) : Boolean = if (Knowledge.omp_parallelizeLoopOverDimensions) {
+    if (maxIterationCount == null)
+      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
+
+    maxIterationCount.product > Knowledge.omp_minWorkItemsPerThread * Knowledge.omp_numThreads
+  } else {
+    false
+  }
+
+  def parallelizationOverFragmentsIsReasonable(maxIterationCount : Array[Long]) : Boolean = if (Knowledge.omp_parallelizeLoopOverFragments) {
+    if (maxIterationCount == null)
+      return true // cannot determine iteration count, default is no change in parallelizability, i.e. true
+
+    math.max(Knowledge.domain_numFragmentsPerBlock / Knowledge.omp_numThreads, 1) * maxIterationCount.product > Knowledge.omp_minWorkItemsPerThread
+  } else {
+    false
+  }
 }
diff --git a/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala b/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
index 52b0993ed..d317edd02 100644
--- a/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
+++ b/Compiler/src/exastencils/polyhedron/IR_PolyExtractor.scala
@@ -373,7 +373,7 @@ class IR_PolyExtractor extends Collector {
       mergeStmts : Boolean) : Unit = {

     this.scop_ = new Scop(root, localContext, globalContext, optLevel,
-      Knowledge.omp_parallelizeLoopOverDimensions && root.parallelizationIsReasonable, root.maxIterationCount())
+      Knowledge.omp_parallelizeLoopOverDimensions && root.parallelizationOverDimensionsIsReasonable(root.maxIterationCount()), root.maxIterationCount())
     if (mergeWithPrev)
       scops.last.nextMerge = this.scop_
     this.modelLoopVars_ = modelLoopVars
@@ -500,7 +500,7 @@ class IR_PolyExtractor extends Collector {
       if (loop.parallelization.reduction.isDefined)
         loop.parallelization.reduction.get.annotate(SKIP_ANNOT)
       // loop.at1stIt is a list of tuple, StateManager does not handle these, so a skip annotation is not required
-      if (loop.parallelizationIsReasonable && loop.polyOptLevel >= 1)
+      if (loop.parallelizationOverDimensionsIsReasonable(loop.maxIterationCount()) && loop.polyOptLevel >= 1)
         enterLoop(loop, merge)

     case _ =>
diff --git a/Testing/Misc/reduction.exa4 b/Testing/Misc/reduction.exa4
index da83f5661..cf79bbb71 100644
--- a/Testing/Misc/reduction.exa4
+++ b/Testing/Misc/reduction.exa4
@@ -24,7 +24,6 @@ Function Application {
   }

   Var redTarget : Real = 0.0
-
   loop over mat@finest with reduction (+: redTarget) {
     redTarget += mat@finest[2][2]
   }
@@ -40,32 +39,32 @@ Function Application {
   Var redTarget2 : Matrix

   loop over mat@finest with reduction (+: redTarget2[0][0]) {
-    redTarget2[0][0] = matrixEntry
+    redTarget2[0][0] += matrixEntry
   }

-  if ( fabs(redTarget2[0][0] - numReds * matrixEntry ) <= eps) {
+  if ( fabs(redTarget2[0][0] - expected ) <= eps) {
     print('Passed stage 1: Reduced matrix entry equals the number of reductions times the original value of the entry')
   } else {
-    print('Failed stage 1: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', numReds * matrixEntry)
+    print('Failed stage 1: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', expected)
   }

   loop over fragments with reduction (max: redTarget2[0][0]) {
     if (fragmentIdx == 0) {
-      redTarget2[0][0] = maxVal
+      redTarget2[0][0] = max(maxVal, redTarget2[0][0])
     }
   }

   if ( fabs(redTarget2[0][0] - maxVal) <= eps ) {
-    print('Passed stage 2: redTarget2[0][0] equals maxVal')
+    print('Passed stage 2: Reduced matrix entry redTarget2[0][0] equals maxVal')
   } else {
-    print('Failed stage 2: redTarget=', redTarget2[0][0], ', maxVal=', maxVal)
+    print('Failed stage 2: Reduced matrix entry redTarget2[0][0]=', redTarget2[0][0], ', expected=', maxVal)
   }

   Var redTarget3 : Matrix = 0.

   loop over fragments with reduction (+: redTarget3) {
-    redTarget3[0] = maxVal
-    redTarget3[1] = minVal
+    redTarget3[0] += maxVal
+    redTarget3[1] += minVal
   }

   if ( fabs(redTarget3[0][0] - numReds * maxVal) <= eps &&
@@ -74,30 +73,30 @@ Function Application {
       fabs(redTarget3[3][0]) <= eps &&
       fabs(redTarget3[4][0]) <= eps ) {

-    print('Passed stage 3')
+    print('Passed stage 3: Reduced matrix matches expected target matrix')
   } else {
     print('Failed stage 3: redTarget3[0][0]=', redTarget3[0][0],
       ', redTarget3[1][0]=', redTarget3[1][0],
       ', redTarget3[2][0]=', redTarget3[2][0],
       ', redTarget3[3][0]=', redTarget3[3][0],
-      ', redTarget3[4][0]=', redTarget3[4][0])
+      ', redTarget3[4][0]=', redTarget3[4][0],
+      '. Expected = {', numReds * maxVal, ',', numReds * minVal, ', 0., 0., 0.}')
   }

-  Var redTarget4 : Matrix = {{maxVal}, {minVal}}
-  Expr numReds2 = getKnowledge ( 'domain_rect_numBlocks_x' ) // no reduction taking place on OpenMP site
+  Var redTarget4 : Matrix = {{matrixEntry}, {matrixEntry}}

-  loop over fragments with reduction (+: redTarget4) {
-    if (sqrt(4) != 2) {
-      print('dummy line')
-    }
+  loop over fragments with reduction (min: redTarget4) {
+    redTarget4[0][0] = min(redTarget4[0][0], minVal)
+    redTarget4[1][0] = min(redTarget4[1][0], maxVal)
   }

-  if ( fabs(redTarget4[0][0] - numReds2 * maxVal) <= eps &&
-       fabs(redTarget4[1][0] - numReds2 * minVal) <= eps) {
+  if ( fabs(redTarget4[0][0] - minVal) <= eps &&
+       fabs(redTarget4[1][0] - matrixEntry) <= eps) {

-    print('Passed stage 4')
+    print('Passed stage 4: Reduced matrix matches expected target matrix')
   } else {
-    print('Failed stage 4: redTarget4[0][0]=', redTarget4[0][0], ', redTarget4[1][0]=', redTarget4[1][0])
+    print('Failed stage 4: redTarget4[0][0]=', redTarget4[0][0], ', redTarget4[1][0]=', redTarget4[1][0],
+      '. Expected: {', minVal, ',', matrixEntry, '}')
   }

   destroyGlobals ( )
diff --git a/Testing/Misc/reduction.results b/Testing/Misc/reduction.results
index 9fb91e614..27d42aa6c 100644
--- a/Testing/Misc/reduction.results
+++ b/Testing/Misc/reduction.results
@@ -1,5 +1,5 @@
 Passed stage 0: redTarget equals expected result
 Passed stage 1: Reduced matrix entry equals the number of reductions times the original value of the entry
-Passed stage 2: redTarget2[0][0] equals maxVal
-Passed stage 3
-Passed stage 4
\ No newline at end of file
+Passed stage 2: Reduced matrix entry redTarget2[0][0] equals maxVal
+Passed stage 3: Reduced matrix matches expected target matrix
+Passed stage 4: Reduced matrix matches expected target matrix
\ No newline at end of file