From 03a52f1a5fae28d90b133478c2ffe1d567a03dba Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Tue, 31 Oct 2023 01:54:44 -0700
Subject: [PATCH] Do not use deprecated usePropertiesForAttributes=0 for
 tf_device ops.

PiperOrigin-RevId: 578101119
---
 .../compiler/mlir/tensorflow/ir/tf_device.cc  |   4 +-
 .../mlir/tensorflow/ir/tf_device_ops.td       |   2 -
 .../tensorflow/tests/breakup-islands.mlir     |   7 +-
 .../tensorflow/tests/cluster_formation.mlir   |  30 +--
 .../tensorflow/tests/cluster_outlining.mlir   |   8 +-
 .../tests/device_attribute_to_launch.mlir     |   4 +-
 ...extract_head_tail_outside_compilation.mlir |  46 ++---
 .../tests/extract_outside_compilation.mlir    |   6 +-
 ...xtract_tpu_copy_with_dynamic_shape_op.mlir |   8 +-
 .../host_launch_to_outside_compiled.mlir      |   2 +-
 .../tensorflow/tests/launch_outlining.mlir    |   8 +-
 .../outside_compiled_to_host_launch.mlir      |   8 +-
 .../replicate_invariant_op_hoisting.mlir      |  20 +-
 .../tensorflow/tests/replicate_to_island.mlir |  44 +++--
 .../tests/replicate_to_island_legacy.mlir     |  12 +-
 .../tests/tpu-dynamic-layout-pass.mlir        |  10 +-
 .../tpu-merge-variables-with-execute.mlir     |  22 ++-
 .../tpu-variable-runtime-reformatting.mlir    |  12 +-
 .../tpu_colocate_composite_resource_ops.mlir  |   8 +-
 .../mlir/tensorflow/tests/tpu_rewrite.mlir    | 177 ++++++------------
 .../mlir/tensorflow/tests/xla_rewrite_v2.mlir |  22 +--
 .../mlir/tfrt/tests/runtime_lowering_tpu.mlir |   4 +-
 .../mlir/tests/move_compilation_to_host.mlir  |  16 +-
 .../mlir/tests/multi_device_expansion.mlir    |  32 ++--
 .../mlir/tests/update_tpu_metadata.mlir       |  18 +-
 25 files changed, 252 insertions(+), 278 deletions(-)
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
index f5a6cf45a4fb17..46cfa429c42573 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
@@ -387,7 +387,7 @@ void ReplicateOp::print(OpAsmPrinter& p) {
   //   packed_input
   //     %b as %block_arg1: type
   const int32_t n = this->getN();
-  const int32_t num_replicated_inputs = getOperandSegmentSizes()[0];
+  const int32_t num_replicated_inputs = getProperties().operandSegmentSizes[0];
   const int32_t num_replicated_block_args = num_replicated_inputs / n;
 
   if (getNumOperands()) {
@@ -502,7 +502,7 @@ LogicalResult ReplicateOp::verify() {
 
   Block& block = op.getBody().front();
 
-  auto operandSegmentSizes = op.getOperandSegmentSizes();
+  auto operandSegmentSizes = op.getProperties().operandSegmentSizes;
   const int32_t num_replicated_inputs = operandSegmentSizes[0];
   const int32_t num_packed_inputs = operandSegmentSizes[1];
 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td
index c0147386ab56d3..343127301d4057 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td
@@ -39,7 +39,6 @@ def TfDevice_Dialect : Dialect {
 }];
 
   let cppNamespace = "::mlir::tf_device";
-  let usePropertiesForAttributes = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -263,7 +262,6 @@ For example:
     Variadic<AnyType>:$replicated_inputs,
     Variadic<AnyType>:$packed_inputs,
 
-    DenseI32ArrayAttr:$operandSegmentSizes,
     ConfinedAttr<I32Attr, [IntMinValue<2>]>:$n,
     OptionalAttr<DictionaryAttr>:$devices
   );
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir
index e94cb5f859ec34..2704fc38b4d569 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir
@@ -544,16 +544,17 @@ func.func @island_not_direct_parent_of_user() -> () {
       tf_executor.yield %0 : tensor<i64>
     }
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
     // CHECK:   "tf.OpC"(%[[VAL_0]]) : (tensor<i64>) -> ()
     // CHECK:   "tf.OpD"() : () -> ()
     // CHECK:   tf_device.return
-    // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> ()
+    // CHECK: }) : () -> ()
     %island2 = tf_executor.island {
-      "tf_device.launch"() ({
+      "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}> ({
         "tf.OpC"(%island1#0) : (tensor<i64>) -> ()
         "tf.OpD"() : () -> ()
         tf_device.return
-      }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> ()
+      }) : () -> ()
       tf_executor.yield
     }
     // CHECK: tf_executor.fetch
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir
index 16acdcec5eda10..5f045823323f21 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_formation.mlir
@@ -10,6 +10,7 @@ module {
     %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
     // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+    // CHECK-SAME: <{device = "tpu0"}>
     // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
     %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -17,7 +18,7 @@ module {
     %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
     // CHECK: tf_device.return %[[C_OUTPUT]]
-    // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+    // CHECK: : () -> tensor<?xi32>
 
     // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[TPU0_OUTPUT]])
     %5 = "tf.D"(%4) : (tensor<?xi32>) -> tensor<?xi32>
@@ -40,6 +41,7 @@ module {
         %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -47,7 +49,7 @@ module {
         %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[C_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[TPU0_OUTPUT]])
         %5 = "tf.D"(%4) : (tensor<?xi32>) -> tensor<?xi32>
@@ -71,6 +73,7 @@ module {
       %1:2 = tf_executor.island {
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.A"(%arg0) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -78,7 +81,7 @@ module {
         %4 = "tf.B"(%3, %arg0) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[B_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[TPU0_OUTPUT]])
         %5 = "tf.C"(%4) : (tensor<?xi32>) -> tensor<?xi32>
@@ -104,6 +107,7 @@ module {
 
       %2:2 = tf_executor.island {
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.A"(%arg0) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -111,7 +115,7 @@ module {
         %4 = "tf.B"(%3, %1#0) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[B_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[TPU0_OUTPUT]])
         %5 = "tf.C"(%4) : (tensor<?xi32>) -> tensor<?xi32>
@@ -135,11 +139,12 @@ module {
       %1:2 = tf_executor.island {
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"() : () -> tensor<?xi32>
         %3 = "tf.A"() {device = "tpu0"} : () -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[A_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[TPU0_OUTPUT]])
         %4 = "tf.B"(%3) : (tensor<?xi32>) -> tensor<?xi32>
@@ -166,6 +171,7 @@ module {
         %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -173,7 +179,7 @@ module {
         %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[C_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
         // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[TPU0_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
@@ -204,6 +210,7 @@ module {
         %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -211,7 +218,7 @@ module {
         %4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[C_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
+        // CHECK: : () -> tensor<?xi32>
 
         // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
         // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
@@ -248,6 +255,7 @@ module {
         // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[ARG_0]])
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -257,7 +265,6 @@ module {
         %5 = "tf.D"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[D_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
 
         // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[C_OUTPUT]], %[[TPU0_OUTPUT]]) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
         %6 = "tf.E"(%4, %5) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
@@ -296,12 +303,11 @@ module {
         %4 = "tf.C"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: %[[TPU0_OUTPUT1:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[A_OUTPUT]], %[[TPU0_OUTPUT0]]) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
         // CHECK: tf_device.return %[[D_OUTPUT]]
         %5 = "tf.D"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
-
         // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[C_OUTPUT]], %[[TPU0_OUTPUT1]]) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
         %6 = "tf.E"(%4, %5) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
@@ -358,11 +364,12 @@ module {
         %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: %[[GPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "gpu0"}>
         // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[ARG_0]])
         // CHECK: tf_device.return %[[C_OUTPUT]]
-        // CHECK: {device = "gpu0"} : () -> tensor<?xi32>
 
         // CHECK: %[[TPU0_OUTPUT:[0-9]*]] = "tf_device.launch"
+        // CHECK-SAME: <{device = "tpu0"}>
         // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[A_OUTPUT]]) : (tensor<?xi32>) -> tensor<?xi32>
         %3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
 
@@ -372,7 +379,6 @@ module {
         %5 = "tf.D"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
         // CHECK: tf_device.return %[[D_OUTPUT]]
-        // CHECK: {device = "tpu0"} : () -> tensor<?xi32>
 
         // CHECK: %[[E_OUTPUT:[0-9]*]] = "tf.E"(%[[GPU0_OUTPUT]], %[[TPU0_OUTPUT]]) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
         %6 = "tf.E"(%4, %5) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir
index a77e449b03de90..90f1cfc2fd5027 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/cluster_outlining.mlir
@@ -10,7 +10,7 @@ func.func @single_cluster(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]])
       %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) {func = @[[CLUSTER:.*]]}
+      // CHECK: %[[CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) <{func = @[[CLUSTER:.*]]}>
       %3 = "tf_device.cluster"() ({
         %4 = "tf.B"(%2) : (tensor<?xi32>) -> tensor<?xi32>
         tf_device.return %4 : tensor<?xi32>
@@ -42,7 +42,7 @@ func.func @multiple_clusters(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]])
       %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[CLUSTER_0_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) {func = @[[CLUSTER_0:.*]]}
+      // CHECK: %[[CLUSTER_0_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[A_OUTPUT]]) <{func = @[[CLUSTER_0:.*]]}>
       %3 = "tf_device.cluster"() ({
         %6 = "tf.B"(%2) : (tensor<?xi32>) -> tensor<?xi32>
         tf_device.return %6 : tensor<?xi32>
@@ -51,7 +51,7 @@ func.func @multiple_clusters(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[CLUSTER_0_OUTPUT]])
       %4 = "tf.D"(%3) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[CLUSTER_1_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[CLUSTER_0_OUTPUT]], %[[D_OUTPUT]]) {func = @[[CLUSTER_1:.*]]}
+      // CHECK: %[[CLUSTER_1_OUTPUT:[0-9]*]] = "tf_device.cluster_func"(%[[CLUSTER_0_OUTPUT]], %[[D_OUTPUT]]) <{func = @[[CLUSTER_1:.*]]}>
       %5 = "tf_device.cluster"() ({
         %6 = "tf.E"(%3) : (tensor<?xi32>) -> tensor<?xi32>
         %7 = "tf.F"(%4, %6) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
@@ -86,7 +86,7 @@ func.func @multiple_clusters(%arg0: tensor<?xi32>) -> tensor<?xi32> {
 func.func @cluster_operands(%arg0: tensor<?xi32>) -> tensor<?xi32> {
   %0 = tf_executor.graph {
     %1:2 = tf_executor.island wraps
-      // CHECK: %[[CLUSTER_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.cluster_func"() {func = @[[CLUSTER:.*]]}
+      // CHECK: %[[CLUSTER_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.cluster_func"() <{func = @[[CLUSTER:.*]]}>
       "tf_device.cluster"() ({
         %3 = "tf.A"() : () -> tensor<?xi32>
         tf_device.return %3 : tensor<?xi32>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/device_attribute_to_launch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/device_attribute_to_launch.mlir
index 3384c6529067db..499688411f55ac 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/device_attribute_to_launch.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/device_attribute_to_launch.mlir
@@ -4,10 +4,10 @@
 // CHECK-LABEL: func @single_op_launch
 func.func @single_op_launch() {
   // CHECK: "tf_device.launch"
+  // CHECK: device = "CPU:0"
   // CHECK: "tf.opA"
   // CHECK-NOT device
   // CHECK: tf_device.return
-  // CHECK: device = "CPU:0"
   "tf.opA"() {device = "CPU:0"} : () -> tensor<i1>
   func.return
 }
@@ -16,10 +16,10 @@ func.func @single_op_launch() {
 // CHECK-LABEL: func @launch_return
 func.func @launch_return() -> tensor<i1> {
   // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+  // CHECK: device = "CPU:0"
   // CHECK: %[[A_OUT:.*]] = "tf.opA"
   // CHECK-NOT device
   // CHECK: tf_device.return %[[A_OUT]]
-  // CHECK: device = "CPU:0"
   // CHECK: return %[[LAUNCH_OUT]]
   %a = "tf.opA"() {device = "CPU:0"} : () -> tensor<i1>
   func.return %a : tensor<i1>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir
index 5f0821a0271092..5f48061760a51f 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_head_tail_outside_compilation.mlir
@@ -6,10 +6,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_single_outside_compiled_op
   func.func @head_single_outside_compiled_op(%arg0: tensor<i32>) {
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.B"
@@ -27,10 +27,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_single_outside_compiled_op_no_operands
   func.func @head_single_outside_compiled_op_no_operands() {
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[A_OUT:.*]] = "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[A_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.B"(%[[LAUNCH_OUT]])
@@ -50,10 +50,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK:      %[[A_OUT:.*]] = "tf.A"
     %a = "tf.A"() : () -> tensor<i32>
     // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[B_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.C"(%[[LAUNCH_OUT]])
@@ -71,10 +71,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_aliased_output
   func.func @head_aliased_output() -> (tensor<i32>, tensor<i32>, tensor<i32>) {
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[A_OUT:.*]] = "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[A_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster"
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"(%[[LAUNCH_OUT]])
@@ -98,6 +98,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_all_cluster_op
   func.func @head_all_cluster_op(%arg0: tensor<i32>) -> tensor<i32> {
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[A_OUT:.*]] = "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]])
@@ -105,7 +106,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-NEXT:   %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]], %arg0)
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[C_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   tf_device.return
@@ -122,6 +122,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_multiple_outside_compiled_ops
   func.func @head_multiple_outside_compiled_ops(%arg0: tensor<i32>) {
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[A_OUT:.*]] = "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]])
@@ -129,7 +130,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-NEXT:   "tf.C"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[B_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.D"(%[[LAUNCH_OUT]])
@@ -149,10 +149,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK:      tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor<i32>)
     //
     // CHECK-NEXT:   %[[LAUNCH_OUT:.*]] = "tf_device.launch"()
+    // CHECK-SAME:   device = "TPU_REPLICATED_HOST_0"
     // CHECK-NEXT:     %[[A_OUT:.*]] = "tf.A"(%[[RI]])
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK-NEXT:     tf_device.return %[[A_OUT]]
-    // CHECK-NEXT:   device = "TPU_REPLICATED_HOST_0"
     //
     // CHECK:        "tf_device.cluster"
     // CHECK-NEXT:     "tf.B"(%[[LAUNCH_OUT]])
@@ -215,10 +215,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"(%[[CLUSTER_OUT]])
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf_device.cluster"() ({
       %a = "tf.A"() : () -> tensor<i32>
       "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> ()
@@ -241,10 +241,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]])
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[B_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     %cluster = "tf_device.cluster"() ({
       %a = "tf.A"() : () -> tensor<i32>
       %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
@@ -268,12 +268,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[C_OUT:.*]] = "tf.C"(%arg0, %[[CLUSTER_OUT]]#1)
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   "tf.D"(%[[C_OUT]], %arg0, %[[CLUSTER_OUT]]#0)
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf_device.cluster"() ({
       %a = "tf.A"() : () -> tensor<i32>
       %b = "tf.B"(%arg0) : (tensor<i32>) -> tensor<i32>
@@ -299,13 +299,13 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[C_OUT:.*]] = "tf.C"(%arg0, %[[CLUSTER_OUT]]#2)
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK         "tf.IfRegion"
     // CHECK:          "tf.D"(%[[C_OUT]], %arg0, %[[CLUSTER_OUT]]#0)
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK:        tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf_device.cluster"() ({
       %0 = "tf.Const"() {value = dense<true> : tensor<i1>} : () -> tensor<i1>
       %a = "tf.A"() : () -> tensor<i32>
@@ -339,10 +339,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      %[[LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[D_OUT:.*]] = "tf.D"(%[[CLUSTER_OUT]]#0, %[[A_OUT]])
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     %cluster:5 = "tf_device.cluster"() ({
       %c = "tf.C"()  : () -> tensor<i32>
       %d = "tf.D"(%c, %a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>, tensor<i32>) -> tensor<i32>
@@ -367,10 +367,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:    device_assignment = []
     //
     // CHECK-NEXT:   "tf_device.launch"()
+    // CHECK-SAME:   device = "TPU_REPLICATED_HOST_0"
     // CHECK-NEXT:     %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]], %[[RI]])
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK-NEXT:     tf_device.return
-    // CHECK-NEXT:   device = "TPU_REPLICATED_HOST_0"
     tf_device.replicate([%arg0, %arg1] as %ri : tensor<i32>) {n = 2 : i32} {
       "tf_device.cluster"() ({
         %a = "tf.A"(%ri) : (tensor<i32>) -> tensor<i32>
@@ -402,10 +402,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @head_tail_simple_extraction
   func.func @head_tail_simple_extraction(%arg0: tensor<i32>) -> tensor<i32> {
     // CHECK:      %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[A_OUT:.*]] = "tf.A"(%arg0)
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[A_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      %[[CLUSTER_OUT:.*]] = "tf_device.cluster"
     // CHECK-NEXT:   %[[B_OUT:.*]] = "tf.B"(%[[HEAD_LAUNCH_OUT]])
@@ -417,10 +417,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:  device_assignment = []
     //
     // CHECK:      %[[TAIL_LAUNCH_OUT:.*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   %[[C_OUT:.*]] = "tf.C"(%[[CLUSTER_OUT]])
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return %[[C_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     %cluster = "tf_device.cluster"() ({
       %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
       %b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
@@ -436,10 +436,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK:      tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor<i32>)
     //
     // CHECK-NEXT:   %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"()
+    // CHECK-SAME:   device = "TPU_REPLICATED_HOST_0"
     // CHECK-NEXT:     %[[A_OUT:.*]] = "tf.A"(%[[RI]])
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK-NEXT:     tf_device.return %[[A_OUT]]
-    // CHECK-NEXT:   device = "TPU_REPLICATED_HOST_0"
     //
     // CHECK:        %[[CLUSTER_OUT:.*]] = "tf_device.cluster"
     // CHECK-NEXT:     %[[B_OUT:.*]] = "tf.B"
@@ -453,10 +453,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-DAG:    device_assignment = []
     //
     // CHECK-NEXT:   "tf_device.launch"()
+    // CHECK-SAME:   device = "TPU_REPLICATED_HOST_0"
     // CHECK-NEXT:     "tf.D"(%[[HEAD_LAUNCH_OUT]], %[[CLUSTER_OUT]], %[[RI]])
     // CHECK-NOT:      _xla_outside_compilation
     // CHECK-NEXT:     tf_device.return
-    // CHECK-NEXT:   device = "TPU_REPLICATED_HOST_0"
     tf_device.replicate([%arg0, %arg1] as %ri : tensor<i32>) {n = 2 : i32} {
       "tf_device.cluster"() ({
         %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
@@ -490,10 +490,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @side_effect_head_no_operand
   func.func @side_effect_head_no_operand() {
     // CHECK:      %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"()
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"
     // CHECK-NEXT:   %[[C_OUT:.*]] = "tf.C"
     // CHECK-NEXT:   tf_device.return %[[C_OUT]]
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
 
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.Const"
@@ -518,10 +518,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-NEXT:   tf_device.return %[[A_OUT]]
 
     // CHECK:      "tf_device.launch"()
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"(%[[CLUSTER_OUT]])
     // CHECK-NEXT:   "tf.C"
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf_device.cluster"() ({
       %a = "tf.A"() : () -> tensor<i32>
       "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> ()
@@ -538,10 +538,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @embedding_head_extraction
   func.func @embedding_head_extraction(%arg0: tensor<!tf_type.string>) {
     // CHECK:      "tf_device.launch"()
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.EnqueueTPUEmbeddingRaggedTensorBatch"
     // CHECK-NEXT:   "tf.EnqueueTPUEmbeddingArbitraryTensorBatch"
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
 
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.UnknownOp"
@@ -560,9 +560,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @op_after_embedding_head_extraction
   func.func @op_after_embedding_head_extraction() {
     // CHECK:      "tf_device.launch"()
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.A"
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
 
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.RecvTPUEmbeddingActivations"
@@ -588,9 +588,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-NEXT:   tf_device.return
 
     // CHECK:      "tf_device.launch"()
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.A"
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf_device.cluster"() ({
       "tf.UnknownOp"() : () -> ()
       "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> ()
@@ -607,10 +607,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
   // CHECK-LABEL: func @head_single_outside_compiled_op_in_generic_pipeline
   func.func @head_single_outside_compiled_op_in_generic_pipeline(%arg0: tensor<i32>) {
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.A"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
     //
     // CHECK:      "tf_device.cluster"
     // CHECK-NEXT:   "tf.B"
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir
index 87acd459ed1350..cbd9942a0f8a73 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir
@@ -32,10 +32,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @nodep_single_outside_compilation() -> () {
      // CHECK: "tf_device.parallel_execute"
      // CHECK-NEXT: "tf_device.launch"
+     // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
      // CHECK-NEXT: "tf.B"
      // CHECK-NOT: _xla_outside_compilation
      // CHECK-NEXT:   tf_device.return
-     // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
      // CHECK: "tf_device.cluster"
      // CHECK-NEXT: "tf.A"
      // CHECK: device_assignment =  [], num_cores_per_replica = 1 : i64, topology =  ""
@@ -102,9 +102,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK:      %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate
     // CHECK:        %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
     // CHECK-NEXT:     "tf_device.launch"
+    // CHECK-SAME:     device = "TPU_REPLICATED_HOST_0"
     // CHECK:            "tf.B"
     // CHECK-NEXT:       tf_device.return
-    // CHECK-NEXT:     device = "TPU_REPLICATED_HOST_0"
     // CHECK:          %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster"
     // CHECK:            tf_device.return
     // CHECK:          tf_device.return %[[TPU_CLUSTER_OUTPUT]]
@@ -1839,10 +1839,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @outside_compilation_model_parallelism() -> () {
      // CHECK: "tf_device.parallel_execute"
      // CHECK-NEXT: "tf_device.launch"
+     // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
      // CHECK-NEXT: "tf.B"
      // CHECK-NOT: _xla_outside_compilation
      // CHECK-NEXT:   tf_device.return
-     // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
      // CHECK: "tf_device.cluster"
      // CHECK-NEXT: "tf.A"
      // CHECK: num_cores_per_replica = 2 : i64
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
index 2c2b36c59121d3..ec3fedf987ab37 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_tpu_copy_with_dynamic_shape_op.mlir
@@ -6,10 +6,10 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSI
   // CHECK-LABEL: func @valid_copy_op_in_replicated_host
 
   // CHECK: "tf_device.launch"
-  // CHECK: "TPU_REPLICATED_HOST_0"
+  // CHECK-SAME: "TPU_REPLICATED_HOST_0"
   // CHECK: "tf_device.launch"
+  // CHECK-SAME: "TPU_REPLICATED_CORE_0"
   // CHECK: "tf.TPUCopyWithDynamicShape"
-  // CHECK: "TPU_REPLICATED_CORE_0"
   func.func @valid_copy_op_in_replicated_host(
     %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"},
     %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<2048xi32>, tensor<2048xi32>) {
@@ -26,10 +26,10 @@ module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:COMPOSI
   // CHECK-LABEL: func @valid_copy_op_in_non_replicated_host
 
   // CHECK: "tf_device.launch"
-  // CHECK: "/job:localhost/replica:0/task:0/device:CPU:0"
+  // CHECK-SAME: "/job:localhost/replica:0/task:0/device:CPU:0"
   // CHECK: "tf_device.launch"
+  // CHECK-SAME: "/job:localhost/replica:0/task:0/device:TPU:0"
   // CHECK: "tf.TPUCopyWithDynamicShape"
-  // CHECK: "/job:localhost/replica:0/task:0/device:TPU:0"
   func.func @valid_copy_op_in_non_replicated_host(
     %arg0: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"},
     %arg1: tensor<2048xi64> {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<2048xi32>, tensor<2048xi32>) {
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir b/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir
index b6a0a2b48567d3..d7867332a4812c 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/host_launch_to_outside_compiled.mlir
@@ -28,9 +28,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @single_op_launch_not_host() -> () {
     // CHECK:      "tf.A"
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME:      device = "/job:worker/replica:0/task:0/device:TPU:0"
     // CHECK:        "tf.B"
     // CHECK-NOT:    _xla_outside_compilation
-    // CHECK:      device = "/job:worker/replica:0/task:0/device:TPU:0"
     // CHECK:      "tf.C"
     // CHECK-NEXT: tf_device.return
     "tf_device.cluster"() ({
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/launch_outlining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/launch_outlining.mlir
index 84825bae4aaea4..91d58dff11f5b8 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/launch_outlining.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/launch_outlining.mlir
@@ -10,7 +10,7 @@ func.func @single_launch(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]])
       %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "/device:test_device:0", func = @[[LAUNCH:.*]]}
+      // CHECK: %[[LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) <{device = "/device:test_device:0", func = @[[LAUNCH:.*]]}>
       %3 = "tf_device.launch"() ({
         %4 = "tf.B"(%2) : (tensor<?xi32>) -> tensor<?xi32>
         tf_device.return %4 : tensor<?xi32>
@@ -42,7 +42,7 @@ func.func @multiple_launches(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A"(%[[ARG_0]])
       %2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) {device = "/device:test_device:0", func = @[[LAUNCH_0:.*]]}
+      // CHECK: %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[A_OUTPUT]]) <{device = "/device:test_device:0", func = @[[LAUNCH_0:.*]]}>
       %3 = "tf_device.launch"() ({
         %6 = "tf.B"(%2) : (tensor<?xi32>) -> tensor<?xi32>
         tf_device.return %6 : tensor<?xi32>
@@ -51,7 +51,7 @@ func.func @multiple_launches(%arg0: tensor<?xi32>) -> tensor<?xi32> {
       // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[LAUNCH_0_OUTPUT]])
       %4 = "tf.D"(%3) : (tensor<?xi32>) -> tensor<?xi32>
 
-      // CHECK: %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[LAUNCH_0_OUTPUT]], %[[D_OUTPUT]]) {device = "/device:test_device:0", func = @[[LAUNCH_1:.*]]}
+      // CHECK: %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch_func"(%[[LAUNCH_0_OUTPUT]], %[[D_OUTPUT]]) <{device = "/device:test_device:0", func = @[[LAUNCH_1:.*]]}>
       %5 = "tf_device.launch"() ({
         %6 = "tf.E"(%3) : (tensor<?xi32>) -> tensor<?xi32>
         %7 = "tf.F"(%4, %6) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
@@ -86,7 +86,7 @@ func.func @multiple_launches(%arg0: tensor<?xi32>) -> tensor<?xi32> {
 func.func @launch_operands(%arg0: tensor<?xi32>) -> tensor<?xi32> {
   %0 = tf_executor.graph {
     %1:2 = tf_executor.island wraps
-      // CHECK: %[[LAUNCH_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.launch_func"() {device = "/device:test_device:0", func = @[[LAUNCH:.*]]}
+      // CHECK: %[[LAUNCH_OUTPUT:[a-z0-9]*]], %{{.*}} = {{.*}} "tf_device.launch_func"() <{device = "/device:test_device:0", func = @[[LAUNCH:.*]]}>
       "tf_device.launch"() ({
         %3 = "tf.A"() : () -> tensor<?xi32>
         tf_device.return %3 : tensor<?xi32>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir b/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir
index 2f744534abd3e9..c0230b43d1db04 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/outside_compiled_to_host_launch.mlir
@@ -22,10 +22,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @nodep_single_outside_compilation() -> () {
     // CHECK:      "tf.A"
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK: device_assignment =  [], num_cores_per_replica = 1 : i64, topology =  ""
     "tf_device.cluster"() ({
       "tf.A"() : () -> ()
@@ -45,10 +45,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-NEXT:   "tf_device.cluster"
     // CHECK-NEXT:     "tf.B"
     // CHECK-NEXT:     "tf_device.launch"
+    // CHECK-SAME:     device = "TPU_REPLICATED_HOST_0"
     // CHECK-NEXT:       "tf.C"
     // CHECK-NOT:        _xla_outside_compilation
     // CHECK:            tf_device.return
-    // CHECK-NEXT:     device = "TPU_REPLICATED_HOST_0"
     // CHECK: device_assignment =  [], num_cores_per_replica = 1 : i64, topology =  ""
     %0 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
     tf_device.replicate([%0, %arg0] as %ri_0: tensor<?xi32>) {n = 2 : i32} {
@@ -136,10 +136,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @called_outside_compilation_callee() -> () {
     // CHECK:      "tf.A"
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     "tf.A"() : () -> ()
     "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> ()
     "tf.C"() : () -> ()
@@ -178,10 +178,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @outside_compilation_model_parallelism() -> () {
     // CHECK:      "tf.A"
     // CHECK:      "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   "tf.B"
     // CHECK-NOT:    _xla_outside_compilation
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK: num_cores_per_replica = 2 : i64
     %0 = "tf_device.cluster"() ({
       "tf.A"() : () -> ()
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir
index 024caf9297bd3e..ec30a7bdc906d3 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_invariant_op_hoisting.mlir
@@ -156,19 +156,19 @@ func.func @nested_ops(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) {
 
 // CHECK:      %[[SHAPE:[0-9]*]] = "tf.Shape"(%[[ARG_0]])
 // CHECK-NEXT: %[[LAUNCH_A:[0-9]*]] = "tf_device.launch"
+// CHECK-SAME: device = "a"
 // CHECK-NEXT:   %[[OP_A:[0-9]*]] = "tf.opA"(%[[SHAPE]])
 // CHECK-NEXT:   tf_device.return %[[OP_A]]
-// CHECK-NEXT: device = "a"
-// CHECK-NEXT: %[[LAUNCH_B:[0-9]*]] = "tf_device.launch"
+// CHECK:      %[[LAUNCH_B:[0-9]*]] = "tf_device.launch"
+// CHECK-SAME: device = "b"
 // CHECK-NEXT:   %[[OP_B:[0-9]*]] = "tf.opB"(%[[SHAPE]], %[[LAUNCH_A]])
 // CHECK-NEXT:   tf_device.return %[[OP_B]]
-// CHECK-NEXT: device = "b"
-// CHECK-NEXT: tf_device.replicate([{{.*}}] as %[[RI:[a-z0-9]+]]: tensor<*xf32>)
+// CHECK: tf_device.replicate([{{.*}}] as %[[RI:[a-z0-9]+]]: tensor<*xf32>)
 // CHECK-NEXT:   %[[LAUNCH_C:[0-9]*]] = "tf_device.launch"
+// CHECK-SAME:   device = "c"
 // CHECK-NEXT:     %[[OP_C:[0-9]*]] = "tf.opC"(%[[RI]], %[[LAUNCH_B]])
 // CHECK-NEXT:     tf_device.return %[[OP_C]]
-// CHECK-NEXT:   device = "c"
-// CHECK-NEXT:   tf_device.return %[[SHAPE]], %[[LAUNCH_A]], %[[LAUNCH_B]], %[[LAUNCH_C]]
+// CHECK:   tf_device.return %[[SHAPE]], %[[LAUNCH_A]], %[[LAUNCH_B]], %[[LAUNCH_C]]
 
 
 // CHECK-LABEL:   func @do_not_hoist_ops_with_virtual_device
@@ -193,14 +193,14 @@ func.func @do_not_hoist_ops_with_virtual_device(%arg0: tensor<*xf32>, %arg1: ten
 // CHECK:  [[SHAPE:%.*]] = "tf.Shape"([[VAL_0]])
 // CHECK:  tf_device.replicate({{\[}}[[VAL_0]], [[VAL_1]]] as [[VAL_4:%.*]]: tensor<*xf32>) {devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} {
 // CHECK:    [[OP_A:%.*]] = "tf.opA"([[SHAPE]]) {device = "TPU_REPLICATED_CORE_0"} : (tensor<?xi32>) -> tensor<*xi32>
-// CHECK:    [[LAUNCH_B:%.*]] = "tf_device.launch"() ({
+// CHECK:    [[LAUNCH_B:%.*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}> ({
 // CHECK:      [[OP_B:%.*]] = "tf.opB"([[SHAPE]]) : (tensor<?xi32>) -> tensor<*xi32>
 // CHECK:      tf_device.return [[OP_B]] : tensor<*xi32>
-// CHECK:    }) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor<*xi32>
-// CHECK:    [[LAUNCH_C:%.*]] = "tf_device.launch"() ({
+// CHECK:    }) : () -> tensor<*xi32>
+// CHECK:    [[LAUNCH_C:%.*]] = "tf_device.launch"() <{device = "c"}> ({
 // CHECK:      [[OP_C:%.*]] = "tf.opC"([[SHAPE]]) {device = "TPU_REPLICATED_CORE_0"} : (tensor<?xi32>) -> tensor<*xi32>
 // CHECK:      tf_device.return [[OP_C]] : tensor<*xi32>
-// CHECK:    }) {device = "c"} : () -> tensor<*xi32>
+// CHECK:    }) : () -> tensor<*xi32>
 // CHECK:    tf_device.return [[SHAPE]], [[OP_A]], [[LAUNCH_B]], [[LAUNCH_C]]
 
 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir
index 8fec2a5bb55223..a27a0ff5785ae2 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir
@@ -44,9 +44,11 @@ func.func @no_devices() {
 }
 
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:0", device = "CORE_0"
+// device = "CORE_0"
+// CHECK: _parallel_execution_ids = "r0:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:1", device = "CORE_0"
+// device = "CORE_0"
+// CHECK: _parallel_execution_ids = "r0:1"
 
 
 // Tests devices are not remapped if device is not in replicate devices.
@@ -69,9 +71,11 @@ func.func @no_override_device() {
 }
 
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:0", device = "/TPU:2"
+// device = "/TPU:2"
+// CHECK: _parallel_execution_ids = "r0:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:1", device = "/TPU:2"
+// device = "/TPU:2"
+// CHECK: _parallel_execution_ids = "r0:1"
 
 
 // Tests devices are remapped if device is in replicate devices.
@@ -94,9 +98,11 @@ func.func @remap_device() {
 }
 
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:0", device = "/CPU:0"
+// device = "/CPU:0"
+// CHECK: _parallel_execution_ids = "r0:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:1", device = "/GPU:1"
+// device = "/GPU:1"
+// CHECK: _parallel_execution_ids = "r0:1"
 
 
 // Tests replicate with control dependency output has each expanded replica
@@ -305,20 +311,20 @@ func.func @nested_parallel_execute(%arg0: tensor<i1>, %arg1: tensor<i1>) {
 // CHECK:      tf_executor.island
 // CHECK:      tf_device.parallel_execute
 // CHECK:      tf_device.launch
+// CHECK:      <{device = "/TPU:1"}>
 // CHECK:      tf.OpA
-// CHECK:      {device = "/TPU:1"}
 // CHECK:      tf_device.launch
+// CHECK:      <{device = "/TPU:2"}>
 // CHECK:      tf.OpB
-// CHECK:      {device = "/TPU:2"}
 // CHECK:      _parallel_execution_ids = "r0:0"
 // CHECK:      tf_executor.island
 // CHECK:      tf_device.parallel_execute
 // CHECK:      tf_device.launch
+// CHECK:      <{device = "/TPU:1"}>
 // CHECK:      tf.OpA
-// CHECK:      {device = "/TPU:1"}
 // CHECK:      tf_device.launch
+// CHECK:      <{device = "/TPU:2"}>
 // CHECK:      tf.OpB
-// CHECK:      {device = "/TPU:2"}
 // CHECK:      _parallel_execution_ids = "r0:1"
 // CHECK:      tf_executor.fetch
 
@@ -343,9 +349,11 @@ func.func @merge_of_parallel_group_attr() {
 }
 
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r4:5,r0:0", device = "/CPU:0"
+// device = "/CPU:0"
+// CHECK: _parallel_execution_ids = "r4:5,r0:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r4:5,r0:1", device = "/GPU:1"
+// device = "/GPU:1"
+// CHECK: _parallel_execution_ids = "r4:5,r0:1"
 
 // -----
 
@@ -418,10 +426,14 @@ func.func @no_override_device_new() {
   func.return
 }
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:0", device = "/TPU:0"
+// device = "/TPU:0"
+// CHECK: _parallel_execution_ids = "r0:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r0:1", device = "/TPU:0"
+// device = "/TPU:0"
+// CHECK: _parallel_execution_ids = "r0:1"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r1:0", device = "/TPU:1"
+// device = "/TPU:1"
+// CHECK: _parallel_execution_ids = "r1:0"
 // CHECK: "tf.opA"
-// CHECK: _parallel_execution_ids = "r1:1", device = "/TPU:1"
+// device = "/TPU:1"
+// CHECK: _parallel_execution_ids = "r1:1"
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island_legacy.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island_legacy.mlir
index 24d498ebe88601..2c47b0835d7459 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island_legacy.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island_legacy.mlir
@@ -43,10 +43,12 @@ func.func @no_devices() {
   func.return
 }
 
-// CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "CORE_0"
 // CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "CORE_0"
+// CHECK: "tf.opA"
 
 
 // Tests devices are not remapped if device is not in replicate devices.
@@ -68,10 +70,12 @@ func.func @no_override_device() {
   func.return
 }
 
-// CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "/TPU:2"
 // CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "/TPU:2"
+// CHECK: "tf.opA"
 
 
 // Tests devices are remapped if device is in replicate devices.
@@ -93,10 +97,12 @@ func.func @remap_device() {
   func.return
 }
 
-// CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "/CPU:0"
 // CHECK: "tf.opA"
+// CHECK: "tf_device.launch"
 // CHECK: device = "/GPU:1"
+// CHECK: "tf.opA"
 
 
 // Tests replicate with control dependency output has each expanded replica
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir
index 553bfa0955106b..75bf23d23519b2 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-dynamic-layout-pass.mlir
@@ -429,9 +429,9 @@ func.func @parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.device = "/de
     // CHECK-NEXT: %[[COPY0:.*]] = "tf.TPUCopyWithLayout"(%[[ITER]]#0, %[[LAYOUT0]])
     // CHECK-SAME: device = "/device:TPU:0"
     // CHECK-NEXT: "tf_device.launch"
+    // CHECK-SAME: device = "/device:TPU:0"
     // CHECK-NEXT: "tf.TPUExecute"(%[[COPY0]], %[[COMPILE]]#1)
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/device:TPU:0"
     "tf_device.launch"() ({
       "tf.TPUExecute"(%2#0, %compile#1) : (tensor<128xf32>, tensor<2x!tf_type.string>) -> ()
       tf_device.return
@@ -442,9 +442,9 @@ func.func @parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.device = "/de
     // CHECK: %[[COPY1:.*]] = "tf.TPUCopyWithLayout"(%[[ITER]]#1, %[[LAYOUT1]])
     // CHECK-SAME: device = "/device:TPU:1"
     // CHECK-NEXT: "tf_device.launch"
+    // CHECK-SAME: device = "/device:TPU:1"
     // CHECK-NEXT: "tf.TPUExecute"(%[[COPY1]], %[[COMPILE]]#2)
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/device:TPU:1"
     "tf_device.launch"() ({
       "tf.TPUExecute"(%2#1, %compile#2) : (tensor<128xf32>, tensor<2x!tf_type.string>) -> ()
       tf_device.return
@@ -501,9 +501,10 @@ func.func @replicated_parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.de
   tf_device.replicate([%2#0, %3#0] as %r0: tensor<128xf32>, [%2#1, %3#1] as %r1: tensor<128xf32>) {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/device:TPU:2", "/device:TPU:3"]}} {
     // CHECK: "tf_device.parallel_execute"
     "tf_device.parallel_execute"() ({
+      // CHECK: "tf_device.launch"
+      // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
       // CHECK: "tf.TPUExecute"(%[[R0]], %[[COMPILE]]#1)
       // CHECK-NEXT: tf_device.return
-      // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
       "tf_device.launch"() ({
         "tf.TPUExecute"(%r0, %compile#1) : (tensor<128xf32>, tensor<2x!tf_type.string>) -> ()
         tf_device.return
@@ -511,9 +512,10 @@ func.func @replicated_parallel_execute(%arg0: tensor<*x!tf_type.resource> {tf.de
       tf_device.return
     },
     {
+      // CHECK: "tf_device.launch"
+      // CHECK-SAME: device = "TPU_REPLICATED_CORE_1"
       // CHECK: "tf.TPUExecute"(%[[R1]], %[[COMPILE]]#2)
       // CHECK-NEXT: tf_device.return
-      // CHECK-NEXT: device = "TPU_REPLICATED_CORE_1"
       "tf_device.launch"() ({
         "tf.TPUExecute"(%r1, %compile#2) : (tensor<128xf32>, tensor<2x!tf_type.string>) -> ()
         tf_device.return
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir
index e3191b5d9b6d6e..880703afd0fe5b 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-merge-variables-with-execute.mlir
@@ -27,6 +27,7 @@ func.func @merge_same_device_variables(
       tf_device.return %0#0, %0#1 : tensor<!tf_type.string>, tensor<2x!tf_type.string>
     }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<2x!tf_type.string>)
   // CHECK: %[[EXE:.*]] = "tf_device.launch"
+  // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
   // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ID_0]], %[[ARG_1]], %[[READ_2]], %[[COMPILE]]#1)
   // CHECK-SAME: device_var_reads_indices = [0, 1],
   // CHECK-SAME: device_var_updates_indices = [0, -1]
@@ -38,7 +39,7 @@ func.func @merge_same_device_variables(
     tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<16xf32>
   }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<16xf32>)
   // CHECK-NEXT: tf_device.return
-  // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"}
+  // CHECK-NEXT: })
   "tf.AssignVariableOp"(%id0, %execute#0) : (tensor<*x!tf_type.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_2]], %[[EXE]])
   "tf.AssignVariableOp"(%arg2, %execute#1) : (tensor<*x!tf_type.resource<tensor<16xf32>>>, tensor<16xf32>) -> ()
@@ -71,6 +72,7 @@ func.func @merge_replicated_variables(
   // CHECK: tf_device.replicate([%[[ARG_1]], %[[ARG_2]]] as %[[R_ARG:.*]]: tensor<*x!tf_type.resource<tensor<32xf32>>>)
   tf_device.replicate([%arg1, %arg2] as %r: tensor<*x!tf_type.resource<tensor<32xf32>>>) {n = 2 : i32} {
     // CHECK-NEXT: "tf_device.launch"
+    // CHECK-SAME: <{device = ""}>
     // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[R_ARG]], %[[COMPILE]]#1)
     // CHECK-SAME: device_var_reads_indices = [1],
     // CHECK-SAME: device_var_updates_indices = [0]
@@ -81,7 +83,7 @@ func.func @merge_replicated_variables(
       tf_device.return %0 : tensor<32xf32>
     }) {device = ""} : () -> tensor<32xf32>
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: }) {device = ""}
+    // CHECK-NEXT: })
     "tf.AssignVariableOp"(%r, %execute) : (tensor<*x!tf_type.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
     // CHECK-NEXT: tf_device.return
     tf_device.return
@@ -130,6 +132,7 @@ func.func @interfering_accesses(
     tf_device.return %0#0, %0#1 : tensor<!tf_type.string>, tensor<2x!tf_type.string>
   }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<2x!tf_type.string>)
   // CHECK: %[[EXE:.*]]:2 = "tf_device.launch"
+  // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
   // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[READ_0]], %[[ARG_1]], %[[ARG_4]], %[[READ_5]], %[[COMPILE]]#1)
   // CHECK-SAME: device_var_reads_indices = [1, 2],
   // CHECK-SAME: device_var_updates_indices = [1, -1]
@@ -142,7 +145,7 @@ func.func @interfering_accesses(
     tf_device.return %0#0, %0#1, %0#2 : tensor<32xf32>, tensor<64xf32>, tensor<8xf32>
   }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>)
   // CHECK-NEXT: tf_device.return
-  // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"}
+  // CHECK-NEXT: })
   "tf.AssignVariableOp"(%arg1, %execute#1) : (tensor<*x!tf_type.resource<tensor<64xf32>>>, tensor<64xf32>) -> ()
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0)
   "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf_type.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
@@ -197,6 +200,7 @@ func.func @non_interfering_accesses(
     tf_device.return %0#0, %0#1 : tensor<!tf_type.string>, tensor<2x!tf_type.string>
   }) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<2x!tf_type.string>)
   // CHECK: %[[EXE:.*]] = "tf_device.launch"
+  // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
   // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ARG_0]], %[[ARG_1]], %[[ARG_3]], %[[ARG_4]], %[[COMPILE]]#1)
   // CHECK-SAME: device_var_reads_indices = [0, 1, 2, 3],
   // CHECK-SAME: device_var_updates_indices = [0, 1, -1, -1]
@@ -209,7 +213,7 @@ func.func @non_interfering_accesses(
     tf_device.return %0#0, %0#1, %0#2 : tensor<32xf32>, tensor<64xf32>, tensor<8xf32>
   }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<64xf32>, tensor<8xf32>)
   // CHECK-NEXT: tf_device.return
-  // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"}
+  // CHECK-NEXT: })
   // CHECK-NEXT: %[[READ:.*]] = "tf.ReadVariableOp"(%[[ARG_3]])
   %read3 = "tf.ReadVariableOp"(%arg3) : (tensor<*x!tf_type.resource<tensor<8xf32>>>) -> tensor<8xf32>
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_3]], %[[EXE]])
@@ -236,6 +240,7 @@ func.func @do_not_merge_multi_read(
   // CHECK-NEXT: %[[READ_1:.*]] = "tf.ReadVariableOp"(%[[ARG_0]])
   %read1 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource<tensor<32xf32>>>) -> tensor<32xf32>
   // CHECK-NEXT: %[[EXE:.*]] = "tf_device.launch"
+  // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
   // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[READ_1]], %[[ARG_1]])
   %execute = "tf_device.launch"() ({
     %0 = "tf.TPUExecute"(%read0, %read1, %arg1) {
@@ -244,7 +249,7 @@ func.func @do_not_merge_multi_read(
     tf_device.return %0 : tensor<32xf32>
   }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<32xf32>
   // CHECK-NEXT: tf_device.return
-  // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"}
+  // CHECK-NEXT: })
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]])
   "tf.AssignVariableOp"(%arg0, %execute) : (tensor<*x!tf_type.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
   // CHECK-NEXT: return
@@ -265,6 +270,7 @@ func.func @do_not_merge_multi_assign(
   // CHECK-NEXT: %[[READ_0:.*]] = "tf.ReadVariableOp"(%[[ARG_0]])
   %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource<tensor<32xf32>>>) -> tensor<32xf32>
   // CHECK-NEXT: %[[EXE:.*]]:2 = "tf_device.launch"
+  // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
   // CHECK-NEXT: "tf.TPUExecute"(%[[READ_0]], %[[ARG_1]])
   %execute:2 = "tf_device.launch"() ({
     %0:2 = "tf.TPUExecute"(%read0, %arg1) {
@@ -273,7 +279,7 @@ func.func @do_not_merge_multi_assign(
     tf_device.return %0#0, %0#1 : tensor<32xf32>, tensor<32xf32>
   }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> (tensor<32xf32>, tensor<32xf32>)
   // CHECK-NEXT: tf_device.return
-  // CHECK-NEXT: }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"}
+  // CHECK-NEXT: })
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#0)
   "tf.AssignVariableOp"(%arg0, %execute#0) : (tensor<*x!tf_type.resource<tensor<32xf32>>>, tensor<32xf32>) -> ()
   // CHECK-NEXT: "tf.AssignVariableOp"(%[[ARG_0]], %[[EXE]]#1)
@@ -301,22 +307,22 @@ func.func @parallel_execute(
   // CHECK: "tf_device.parallel_execute"
   %pe:2 = "tf_device.parallel_execute"() ({
     // CHECK: "tf_device.launch"
+    // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:TPU:0"
     %execute0 = "tf_device.launch"() ({
       // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ARG_0]], %[[ARG_2]])
       %0 = "tf.TPUExecute"(%read0, %arg2) : (tensor<32xf32>, tensor<!tf_type.string>) -> tensor<32xf32>
       // CHECK-NEXT: tf_device.return
       tf_device.return %0 : tensor<32xf32>
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:0"
     }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<32xf32>
     tf_device.return %execute0 : tensor<32xf32>
   }, {
     // CHECK: "tf_device.launch"
+    // CHECK-SAME: device = "/job:localhost/replica:0/task:0/device:TPU:1"
     %execute1 = "tf_device.launch"() ({
       // CHECK-NEXT: "tf.TPUExecuteAndUpdateVariables"(%[[ARG_1]], %[[ARG_2]])
       %1 = "tf.TPUExecute"(%read1, %arg2) : (tensor<64xf32>, tensor<!tf_type.string>) -> tensor<64xf32>
       // CHECK-NEXT: tf_device.return
       tf_device.return %1 : tensor<64xf32>
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:TPU:1"
     }) {device = "/job:localhost/replica:0/task:0/device:TPU:1"} : () -> tensor<64xf32>
     tf_device.return %execute1 : tensor<64xf32>
   }) : () -> (tensor<32xf32>, tensor<64xf32>)
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir
index 55949182f5d291..9a903d73b76b8b 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir
@@ -61,9 +61,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
             // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]])
             %id = "tf.Identity"(%arg30) : (tensor<*x!tf_type.resource<tensor<f32>>>) -> tensor<*x!tf_type.resource<tensor<f32>>>
             // CHECK: "tf_device.launch"
+            // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
             // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]])
             // CHECK-NEXT: tf_device.return
-            // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
             // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1)
             "tf_device.launch"() ({
               "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1)
@@ -84,9 +84,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[STATE:.*]]: tensor<!tf_type.resource<tensor<2x!tf_type.string>>>
     // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]
     // CHECK: "tf_device.launch"
+    // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
     // CHECK-NEXT: "tf.TPUReshardVariables"(%[[V0]], %[[V1]], %[[DEFAULT]], %[[STATE]])
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
     func.return
   }
 }
@@ -296,9 +296,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
             %id = "tf.Identity"(%arg30) : (tensor<*x!tf_type.resource<tensor<f32>>>) -> tensor<*x!tf_type.resource<tensor<f32>>>
             // CHECK: "tf_device.parallel_execute"
             // CHECK: "tf_device.launch"
+            // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
             // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]])
             // CHECK-NEXT: tf_device.return
-            // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
             // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1)
 	    "tf_device.parallel_execute"() ({
               "tf_device.launch"() ({
@@ -324,9 +324,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[STATE:.*]]: tensor<!tf_type.resource<tensor<2x!tf_type.string>>>
     // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]
     // CHECK: "tf_device.launch"
+    // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
     // CHECK-NEXT: "tf.TPUReshardVariables"(%[[V0]], %[[V1]], %[[DEFAULT]], %[[STATE]])
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
     func.return
   }
 }
@@ -391,9 +391,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
             // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]])
             %id = "tf.Identity"(%arg30) : (tensor<*x!tf_type.resource<tensor<f32>>>) -> tensor<*x!tf_type.resource<tensor<f32>>>
             // CHECK: "tf_device.launch"
+            // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
             // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]])
             // CHECK-NEXT: tf_device.return
-            // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
             // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1)
             "tf_device.launch"() ({
               "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1)
@@ -414,9 +414,9 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     // CHECK-SAME: %[[ARG2]] as %[[V1:.*]]: tensor<*x!tf_type.resource<tensor<3x3x1x32xf32>>>
     // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]
     // CHECK: "tf_device.launch"
+    // CHECK-SAME: device = "TPU_REPLICATED_CORE_0"
     // CHECK-NEXT: "tf.TPUReshardVariables"(%[[V0]], %[[V1]], %[[DEFAULT]], %[[STATE]])
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0"
     func.return
   }
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir
index b2896fa543f31d..62fe231555b647 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir
@@ -13,9 +13,9 @@ func.func @testReadVariableOpColocated(%arg0: tensor<*x!tf_type.resource<tensor<
     devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]},
     n = 2 : i32} {
      // CHECK:      %[[RESOURCE_OUT:.*]] = "tf_device.launch"()
+     // CHECK-SAME: TPU_REPLICATED_CORE_0
      // CHECK-NEXT:   %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[RI_0]])
      // CHECK-NEXT:   tf_device.return %[[READ_OUT]]
-     // CHECK-NEXT: TPU_REPLICATED_CORE_0
      %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource<tensor<4xf32>>>) -> tensor<4xf32>
      %1 = "tf.A"() : () -> (tensor<2x!tf_type.string>)
      "tf_device.launch"() ({
@@ -43,9 +43,9 @@ func.func @testReadVariableOpAfterIdentityColocated(%arg0: tensor<*x!tf_type.res
     n = 2 : i32} {
      // CHECK:      %[[IDENTITY_OUT:.*]] = "tf.Identity"(%[[RI_0]])
      // CHECK:      %[[RESOURCE_OUT:.*]] = "tf_device.launch"()
+     // CHECK-SAME: TPU_REPLICATED_CORE_0
      // CHECK-NEXT:   %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[IDENTITY_OUT]])
      // CHECK-NEXT:   tf_device.return %[[READ_OUT]]
-     // CHECK-NEXT: TPU_REPLICATED_CORE_0
      %0 = "tf.Identity"(%arg1) : (tensor<*x!tf_type.resource<tensor<4xf32>>>) -> tensor<*x!tf_type.resource<tensor<4xf32>>>
      %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf_type.resource<tensor<4xf32>>>) -> tensor<4xf32>
      %2 = "tf.A"() : () -> (tensor<2x!tf_type.string>)
@@ -77,9 +77,9 @@ func.func @testAssignVariableOpColocated(%arg0: tensor<*x!tf_type.resource<tenso
     n = 2 : i32} {
      // CHECK:      %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32>
      // CHECK:      "tf_device.launch"()
+     // CHECK-SAME: TPU_REPLICATED_CORE_0
      // CHECK-NEXT:   "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]])
-     // CHECK-NEXT:   tf_device.return
-     // CHECK-NEXT: TPU_REPLICATED_CORE_0
+     // CHECK:   tf_device.return
      %1 = "tf.A"() : () -> (tensor<4xf32>)
      "tf.AssignVariableOp"(%arg1, %1) : (tensor<*x!tf_type.resource<tensor<4xf32>>>, tensor<4xf32>) -> ()
      %2 = "tf.B"() : () -> (tensor<2x!tf_type.string>)
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
index 8b128e5495a1f1..9796913d50cc15 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir
@@ -611,9 +611,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @no_replication_device
   func.func @no_replication_device() {
     "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "__no_replication_cluster", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device = "/job:worker/replica:0/task:0/device:TPU:1", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> ()
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:1"}>
     // CHECK: tf.TPUExecute
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:TPU:1"
     func.return
   }
   func.func @empty_func() {
@@ -629,9 +629,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   // CHECK-LABEL: func @no_replication_device
   func.func @no_replication_device() {
     "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "__no_replication_cluster", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device = "/job:worker/replica:0/task:0/device:CPU:0", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> ()
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:0"}>
     // CHECK: tf.TPUExecute
     // CHECK-NEXT: tf_device.return
-    // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:TPU:0"
     func.return
   }
   func.func @empty_func() {
@@ -709,20 +709,18 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
 
     %1 = "tf_device.cluster_func"(%0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<?xi32>) -> tensor<?xi32>
     // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]])
-    // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+    // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]])
     // CHECK-SAME: metadata
     // CHECK-SAME: mlir_module
     // CHECK-SAME: func @main
     // CHECK-SAME: tf.B
     // CHECK-NOT: func = @tpu0_func
-    // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
     // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch"
+    // CHECK-SAME: device = "/job:worker/replica:0/task:0/device:TPU:0"
     // CHECK-NEXT: "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:0"
 
     %2 = "tf.C"(%1) : (tensor<?xi32>) -> tensor<?xi32>
     // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]])
@@ -756,17 +754,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-SAME: n = 2
     %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor<?xi32>) {n = 2 : i32} {
       // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]])
-      // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]])
       // CHECK-SAME: metadata
       // CHECK-SAME: mlir_module
       // CHECK-SAME: func @main
       // CHECK-SAME: tf.B
       // CHECK-NOT: func = @tpu0_func
-      // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
-      // CHECK: "tf_device.launch"
+      // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
-      // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
       // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch"
       // CHECK-NEXT: "tf.TPUExecute"(%[[RI_0]], %[[COMPILE_OUTPUT]]#1)
       %2 = "tf_device.cluster_func"(%ri_0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<?xi32>) -> tensor<?xi32>
@@ -799,8 +795,8 @@ module attributes {tf.versions = {producer = 888 : i32}} {
 
     %1 = "tf_device.cluster_func"(%0) {device = "gpu0", func = @gpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<?xi32>) -> tensor<?xi32>
     // CHECK: tf_device.cluster_func
-    // CHECK-SAME: device = "gpu0"
     // CHECK-SAME: func = @gpu0_func
+    // CHECK-SAME: device = "gpu0"
     // CHECK-SAME: num_cores_per_replica = 1
     // CHECK-SAME: step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP"
     // CHECK-NOT: metadata
@@ -826,7 +822,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
 
     %1 = "tf_device.cluster_func"(%0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<?xi32>) -> tensor<?xi32>
     // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[A_OUTPUT]])
-    // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+    // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT: "tf._TPUCompileMlir"(%[[A_SHAPE_OUTPUT]])
     // CHECK-SAME: metadata
     // CHECK-SAME: mlir_module
@@ -835,13 +831,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK-SAME: func private @nested_func
     // CHECK-SAME: tf.D
     // CHECK-NOT: func = @tpu0_func
-    // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0"
-    // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch"
+    // CHECK: %[[EXECUTE_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:0"}>
     // CHECK-NEXT: "tf.TPUExecute"(%[[A_OUTPUT]], %[[COMPILE_OUTPUT]]#1)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:0"
 
     %2 = "tf.C"(%1) : (tensor<?xi32>) -> tensor<?xi32>
     // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[EXECUTE_OUTPUT]])
@@ -1198,14 +1191,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK: %[[COMPILE_OUTPUT:[0-9]*]]:3 = "tf_device.launch"
     // CHECK-NEXT: "tf._TPUCompileMlir"()
     // CHECK: "tf_device.launch"
-    // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
+    // CHECK: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
     // CHECK: [[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:0"}>
     // CHECK-NEXT: "tf.TPUExecute"(%[[READ_VAR_0]], %[[COMPILE_OUTPUT]]#1)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:0"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:1"}>
     // CHECK-NEXT: "tf.TPUExecute"(%[[READ_VAR_1]], %[[COMPILE_OUTPUT]]#2)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:1"
     %computation = "tf_device.cluster_func"(%partitioned_input) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @computation, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = [""], output_sharding_configuration = [""], use_spmd_for_xla_partitioning = true} : (tensor<i32>) -> tensor<i32>
     // CHECK-NOT: tf.TPUPartitionedOutputV2
     %partitioned_output:2 = "tf.TPUPartitionedOutputV2"(%computation) {N = 2 : i64, partition_dims = []} : (tensor<i32>) -> (tensor<i32>, tensor<i32>)
@@ -1238,12 +1229,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
     // CHECK: "tf_device.launch"
     // CHECK-NEXT: "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUTPUT]]#0)
     // CHECK: [[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:0"}>
     // CHECK-NEXT: "tf.TPUExecute"(%[[READ_VAR_0]], %[[COMPILE_OUTPUT]]#1)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:0"
-    // CHECK: "tf_device.launch"
+    // CHECK: "tf_device.launch"() <{device = "/job:worker/replica:0/task:0/device:TPU:1"}>
     // CHECK-NEXT: "tf.TPUExecute"(%[[READ_VAR_1]], %[[COMPILE_OUTPUT]]#2)
-    // CHECK: device = "/job:worker/replica:0/task:0/device:TPU:1"
     %computation = "tf_device.cluster_func"(%partitioned_input) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @computation, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01"], output_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01"], use_spmd_for_xla_partitioning = true} : (tensor<3x4xf32>) -> tensor<3x4xf32>
     // CHECK-NOT: tf.TPUPartitionedOutputV2
     %partitioned_output:2 = "tf.TPUPartitionedOutputV2"(%computation) {_XlaSharding = "\08\03\1A\02\01\02\22\02\00\01", partition_dims = [1, 2]} : (tensor<3x4xf32>) -> (tensor<3x2xf32>, tensor<3x2xf32>)
@@ -1443,10 +1432,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
       // CHECK: "tf_device.parallel_execute"
       // CHECK-NOT:"tf._XlaCompileMlirPlaceholderProgramKey"
       // CHECK:    "tf.D"(%[[COMPILE_OUTPUT]]#1
+      // CHECK:    "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK:    "tf.TPUExecute"
-      // CHECK:      device = "TPU_REPLICATED_CORE_0"
+      // CHECK:     "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK:    "tf.TPUExecute"
-      // CHECK:      device = "TPU_REPLICATED_CORE_1"
       // CHECK-NOT:    "tf.TPUExecute"
       %3 = "tf_device.parallel_execute"() ({
          %program = "tf._XlaCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string>
@@ -1485,10 +1474,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
       // CHECK: "tf._TPUCompileMlir"
       // CHECK: "tf.TPUCompileSucceededAssert"
       // CHECK: "tf_device.parallel_execute"
+      // CHECK:    "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK:    "tf.TPUExecute"
-      // CHECK:      device = "TPU_REPLICATED_CORE_0"
+      // CHECK:    "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK:    "tf.TPUExecute"
-      // CHECK:      device = "TPU_REPLICATED_CORE_1"
       // CHECK-NOT:    "tf.TPUExecute"
       // CHECK-NOT:"tf._XlaCompileMlirPlaceholderProgramKey"
       // CHECK:    "tf.D"(%[[COMPILE_OUTPUT]]#1
@@ -1524,23 +1513,19 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
 module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} {
   // CHECK-LABEL: func @non_replicated_parallel_execute
   func.func @non_replicated_parallel_execute(%arg0: tensor<8xi32>) -> tensor<8xi32> {
-    // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+    // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   "tf._TPUCompileMlir"()
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    // CHECK:      "tf_device.launch"
+    // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
     // CHECK:      "tf_device.parallel_execute"
-    // CHECK-NEXT:   "tf_device.launch"
+    // CHECK-NEXT:   "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
     // CHECK-NEXT:     "tf.TPUExecute"
     // CHECK-NEXT:     tf_device.return
-    // CHECK-NEXT:   device = "/job:localhost/replica:0/task:0/device:TPU:0"
-    // CHECK:        "tf_device.launch"
+    // CHECK:        "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:1"}>
     // CHECK-NEXT:     "tf.TPUExecute"
     // CHECK-NEXT:     tf_device.return
-    // CHECK-NEXT:   device = "/job:localhost/replica:0/task:0/device:TPU:1"
     %0 = "tf_device.cluster_func"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<8xi32>) -> tensor<8xi32>
     func.return %0 : tensor<8xi32>
   }
@@ -1587,23 +1572,19 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK: tf_device.replicate
     // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"], TPU_REPLICATED_HOST_0 = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:1/device:CPU:0"], TPU_REPLICATED_HOST_1 = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:1/device:CPU:0"]}
     %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} {
-      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"()
       // CHECK-NEXT:   tf_device.return
-      // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
       // CHECK-NEXT:   tf_device.return
-      // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      "tf_device.parallel_execute"
-      // CHECK-NEXT:   "tf_device.launch"
+      // CHECK-NEXT:   "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        "tf_device.launch"
+      // CHECK:        "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_1"
       %1 = "tf_device.cluster_func"(%ri) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<8xi32>) -> tensor<8xi32>
       tf_device.return %1 : tensor<8xi32>
     }
@@ -1632,13 +1613,11 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
       // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
       // CHECK:      "tf._TPUCompileMlir"
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[RI_0]], %[[RI_1]], %[[RI_2]], %[[COMPILE]]#1)
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        "tf_device.launch"
+      // CHECK:        "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     "tf.TPUExecute"(%[[RI_1]], %[[RI_2]], %[[COMPILE]]#2)
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       %1 = "tf_device.cluster_func"(%ri, %ri2, %ri3) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "", ""], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<8xi32>, tensor<*xi1>, tensor<*xi32>) -> tensor<8xi32>
       tf_device.return %1 : tensor<8xi32>
     }
@@ -1663,20 +1642,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"]
     // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]
     %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} {
-      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"()
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     %[[EXECUTE_OUTPUT:[0-9]*]] = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        "tf_device.launch"
+      // CHECK:        "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     "tf.TPUExecute"
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       %1 = "tf_device.cluster_func"(%ri) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], use_spmd_for_xla_partitioning = false} : (tensor<8xi32>) -> tensor<8xi32>
       tf_device.return %1 : tensor<8xi32>
     }
@@ -1700,21 +1675,17 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"]
     // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} {
-      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"()
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:3 = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_0_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_1_OUTPUT]]
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       %1, %2 = "tf_device.cluster_func"(%ri) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""], use_spmd_for_xla_partitioning = false} : (tensor<8xi32>) -> (tensor<*xi32>, tensor<*xi1>)
       tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1>
     }
@@ -1763,25 +1734,21 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"]
     // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       //
       // CHECK:      %[[CONST_SPLIT_DIM:.*]] = "tf.Const"()
       // CHECK:      %[[SPLIT_OUT:[a-z0-9]+]]:2 = "tf.Split"(%[[CONST_SPLIT_DIM]], %[[RI_0]])
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:3 = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       //
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"(%[[SPLIT_OUT]]#0, %[[COMPILE]]#1)
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_0_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"(%[[SPLIT_OUT]]#1, %[[RI_1]], %[[COMPILE]]#2)
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_1_OUTPUT]]
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       %1, %2 = "tf_device.cluster_func"(%ri_1, %ri_2) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @tpu0_func, num_cores_per_replica = 2, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", topology = "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0], input_sharding_configuration = ["\08\03\1A\02\01\02\22\02\00\01", "\08\01\1A\01\01\22\01\01"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", ""], use_spmd_for_xla_partitioning = false} : (tensor<128x10xf32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi1>)
       tf_device.return %1, %2 : tensor<*xi32>, tensor<*xi1>
     }
@@ -1830,22 +1797,18 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"]
     // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       //
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:3 = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_0_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_1_OUTPUT]]
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       //
       // CHECK:     %[[CONST_CONCAT_DIM:.*]] = "tf.Const"()
       // CHECK:     %[[CONCAT_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#0, %[[PARALLEL_EXECUTE_OUTPUT]]#2
@@ -1899,22 +1862,18 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"]
     // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       //
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:3 = "tf_device.parallel_execute"
-      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
+      // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_0"}>
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_0_OUTPUT]]
-      // CHECK-NEXT:   device = "TPU_REPLICATED_CORE_0"
-      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:        %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "TPU_REPLICATED_CORE_1"}>
       // CHECK-NEXT:     %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"
       // CHECK-NEXT:     tf_device.return %[[EXECUTE_1_OUTPUT]]
-      // CHECK:        device = "TPU_REPLICATED_CORE_1"
       //
       // CHECK:     %[[CONST_CONCAT_DIM:.*]] = "tf.Const"()
       // CHECK:     %[[CONCAT_OUTPUT:[0-9]*]] = "tf.Concat"(%[[CONST_CONCAT_DIM]], %[[PARALLEL_EXECUTE_OUTPUT]]#1, %[[PARALLEL_EXECUTE_OUTPUT]]#2
@@ -2091,12 +2050,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
     // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[CONST_SPLIT_0_DIM:.*]] = "tf.Const"()
       // CHECK:      %[[SPLIT_0_OUT:[a-z0-9]+]]:2 = "tf.Split"(%[[CONST_SPLIT_0_DIM]], %[[RI_0]])
       // CHECK:      %[[CONST_SPLIT_1_DIM:.*]] = "tf.Const"()
@@ -2198,12 +2155,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
     // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[CONST_SPLIT_0_DIM:.*]] = "tf.Const"()
       // CHECK:      %[[SPLIT_0_OUT:[a-z0-9]+]]:2 = "tf.Split"(%[[CONST_SPLIT_0_DIM]], %[[RI_0]])
       // CHECK:      %[[CONST_SPLIT_1_DIM:.*]] = "tf.Const"()
@@ -2282,12 +2237,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
     // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:5 = "tf_device.parallel_execute"
       // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
@@ -2367,12 +2320,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
     // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[CONST_SPLIT_0_DIM:.*]] = "tf.Const"()
       // CHECK:      %[[SPLIT_0_OUT:[a-z0-9]+]]:2 = "tf.Split"(%[[CONST_SPLIT_0_DIM]], %[[RI_0]])
       // CHECK:      %[[CONST_SPLIT_1_DIM:.*]] = "tf.Const"()
@@ -2451,12 +2402,10 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc
     // CHECK-SAME: [%[[ARG_0]], %[[ARG_1]]] as %[[RI_0:[a-z0-9]*]]: tensor<128x10xf32>
     // CHECK-SAME: [%[[ARG_2]], %[[ARG_3]]] as %[[RI_1:[a-z0-9]*]]: tensor<*xi32>
     %0:2, %1:2 = tf_device.replicate([%arg0, %arg1] as %ri_1: tensor<128x10xf32>, [%arg2, %arg3] as %ri_2: tensor<*xi32>) {n = 2 : i32} {
-      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"
+      // CHECK:      %[[COMPILE:[a-z0-9]+]]:5 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf._TPUCompileMlir"
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:      "tf_device.launch"
+      // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE]]#0)
-      // CHECK:      device = "/job:localhost/replica:0/task:0/device:CPU:0"
       // CHECK:      %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:5 = "tf_device.parallel_execute"
       // CHECK-NEXT:   %[[LAUNCH_0_OUTPUT:[0-9]*]]:2 = "tf_device.launch"
       // CHECK-NEXT:     %[[EXECUTE_0_OUTPUT:[0-9]*]]:2 = "tf.TPUExecute"
@@ -2613,14 +2562,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
 module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1199 : i32}} {
   func.func @return_from_host_and_tpu() -> (tensor<?xi32>, tensor<?x!tf_type.string>) attributes {tf._construction_context = "kEagerRuntime", tf.signature.is_stateful} {
       // CHECK:     %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute"
-      // CHECK:       %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:       %[[LAUNCH_0_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
       // CHECK:         %[[B_OUTPUT:[0-9]*]] = "tf.B"
       // CHECK:         tf_device.return %[[B_OUTPUT:[0-9]*]]
-      // CHECK:       device = "/job:localhost/replica:0/task:0/device:CPU:0"
-      // CHECK:       %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"
+      // CHECK:       %[[LAUNCH_1_OUTPUT:[0-9]*]] = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}>
       // CHECK-NEXT:    %[[EXECUTE_1_OUTPUT:[0-9]*]] = "tf.TPUExecute"
       // CHECK:         tf_device.return %[[EXECUTE_1_OUTPUT]]
-      // CHECK:       device = "/job:localhost/replica:0/task:0/device:TPU:0"
       // CHECK:    return %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]#1, %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]#0
     %0:2 = "tf_device.parallel_execute"() ({
       %1 = "tf_device.launch"() ({
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir
index e79eb9f0b1fbab..e36bdaa72e41b8 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_rewrite_v2.mlir
@@ -5,11 +5,11 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/
   // CHECK-LABEL: func.func @convert_cluster_func
   func.func @convert_cluster_func(%arg0: tensor<i32>) -> tensor<i32> {
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaCompile"(%arg0) <{function = @func, must_compile = true, operandSegmentSizes = array<i32: 0, 1, 0>}> : (tensor<i32>) -> (tensor<3x!tf_type.string>, tensor<!tf_type.boolref>)
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaRun"(%arg0, %0#0) : (tensor<i32>, tensor<3x!tf_type.string>) -> tensor<i32>
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<i32>
     %0 = "tf_device.cluster_func"(%arg0) {func = @func, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>) -> tensor<i32>
     func.return %0 : tensor<i32>
   }
@@ -25,11 +25,11 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/
   // CHECK-LABEL: func.func @convert_cluster_func_with_resources_in_order
   func.func @convert_cluster_func_with_resources_in_order(%arg0: tensor<!tf_type.resource>, %arg1: tensor<i32>) -> tensor<i32> {
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_in_order, must_compile = true, operandSegmentSizes = array<i32: 0, 1, 1>}> : (tensor<i32>, tensor<!tf_type.resource>)
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor<i32>, tensor<!tf_type.resource>, tensor<3x!tf_type.string>) -> tensor<i32>
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<i32>
     %0 = "tf_device.cluster_func"(%arg1, %arg0) {func = @func_with_resources_in_order, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<i32>, tensor<!tf_type.resource>) -> (tensor<i32>)
     func.return %0 : tensor<i32>
   }
@@ -45,18 +45,18 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/
   // CHECK-LABEL: func.func @convert_cluster_func_with_resources
   func.func @convert_cluster_func_with_resources(%arg0: tensor<!tf_type.resource>, %arg1: tensor<i32>) -> tensor<i32> {
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_1, must_compile = true, operandSegmentSizes = array<i32: 0, 1, 1>}> : (tensor<i32>, tensor<!tf_type.resource>) -> (tensor<3x!tf_type.string>, tensor<!tf_type.boolref>)
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaRun"(%arg1, %arg0, %0#0) : (tensor<i32>, tensor<!tf_type.resource>, tensor<3x!tf_type.string>) -> tensor<i32>
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<i32>
     %0 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources_1, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource>, tensor<i32>) -> tensor<i32>
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaCompile"(%arg1, %arg0) <{function = @func_with_resources_2, must_compile = true, operandSegmentSizes = array<i32: 0, 1, 1>}> : (tensor<i32>, tensor<!tf_type.resource>) -> (tensor<3x!tf_type.string>, tensor<!tf_type.boolref>)
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     // CHECK: "tf_device.launch"()
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaRun"(%arg1, %arg0, %2#0) : (tensor<i32>, tensor<!tf_type.resource>, tensor<3x!tf_type.string>) -> tensor<i32>
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"} : () -> tensor<i32>
     %1 = "tf_device.cluster_func"(%arg0, %arg1) {func = @func_with_resources_2, device = "/job:localhost/replica:0/task:0/device:GPU:0"} : (tensor<!tf_type.resource>, tensor<i32>) -> tensor<i32>
     return %0 : tensor<i32>
   }
@@ -77,16 +77,16 @@ module attributes {tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/
 module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0"], tf.versions = {producer = 888 : i32}} {
   func.func @outside_compilation_in_generic_pipeline(%arg0: tensor<2xi32>) -> tensor<2xi32> {
     // CHECK: tf_device.launch
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: "tf._XlaCompile"() <{function = @func, must_compile = true, operandSegmentSizes = array<i32: 0, 0, 0>}>
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     // CHECK: tf_device.parallel_execute
     // CHECK: tf_device.launch
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK: tf.B
     // CHECK: tf._XlaSendFromHost
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
     // CHECK: tf_device.launch
+    // CHECK-SAME: <{device = "/job:localhost/replica:0/task:0/device:GPU:0"}>
     // CHECK: tf._XlaRun
-    // CHECK: {device = "/job:localhost/replica:0/task:0/device:GPU:0"}
     %0 = "tf_device.parallel_execute"() ({
       "tf_device.launch"() ({
         %1 = "tf._XlaCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string>
diff --git a/tensorflow/compiler/mlir/tfrt/tests/runtime_lowering_tpu.mlir b/tensorflow/compiler/mlir/tfrt/tests/runtime_lowering_tpu.mlir
index 5225c2eed9bb09..d6ffe03d2708e5 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/runtime_lowering_tpu.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/runtime_lowering_tpu.mlir
@@ -4,7 +4,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
 
   // CHECK-LABEL: @converts_cluster
   func.func @converts_cluster() {
-    // CHECK: %0:2 = "tf_device.launch"() ({
+    // CHECK: %0:2 = "tf_device.launch"() <{{.*}}> ({
     // CHECK: %compilation_status, %program = "tf._TPUCompileMlir"()
     "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> ()
     func.return
@@ -26,4 +26,4 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor
   func.func @empty_func() {
     func.return
   }
-}
\ No newline at end of file
+}
diff --git a/tensorflow/dtensor/mlir/tests/move_compilation_to_host.mlir b/tensorflow/dtensor/mlir/tests/move_compilation_to_host.mlir
index 165e037c9a3541..06f9b2fe103d14 100644
--- a/tensorflow/dtensor/mlir/tests/move_compilation_to_host.mlir
+++ b/tensorflow/dtensor/mlir/tests/move_compilation_to_host.mlir
@@ -53,7 +53,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
   // CHECK-LABEL: func private @_func_1
   // CHECK-SAME:  %[[ARG0:.*]]: tensor<i32>
   func.func private @_func_1(%arg0: tensor<i32>) -> tensor<f32> {
-    // CHECK:      %[[COMPILE_OUT:.*]]:2 = "tf_device.launch"()
+    // CHECK:      %[[COMPILE_OUT:.*]]:2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   %[[COMPILATION_STATUS:.*]], %[[PROGRAM_KEY:.*]] = "tf._TPUCompileMlir"()
     // CHECK-NEXT:   "tf._HostSend"(%[[PROGRAM_KEY]])
     // CHECK-SAME:   recv_device = "/job:localhost/replica:0/task:0/device:CPU:0"
@@ -72,12 +72,10 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     // CHECK-SAME:   tensor_name = "compilation_send_recv_key_1
     // CHECK-SAME:   device = "/job:localhost/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   tf_device.return %[[COMPILATION_STATUS]], %[[PROGRAM_KEY]]
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    // CHECK-NEXT: "tf_device.launch"()
+    // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUT]]#0)
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    // CHECK-NEXT:   %[[ID_TO_ORDINAL:.*]] = "tf.Const"
+    // CHECK:        %[[ID_TO_ORDINAL:.*]] = "tf.Const"
     // CHECK-SAME:   value = dense<0>
     // CHECK-NEXT:   %[[SIZE_TYPE:.*]] = "tf.Const"
     // CHECK-SAME:   value = dense<1>
@@ -165,7 +163,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
   // CHECK-LABEL: func private @_func_1
   // CHECK-SAME:  %[[ARG0:.*]]: tensor<i32>
   func.func private @_func_1(%arg0: tensor<i32>) -> tensor<f32> {
-    // CHECK:      %[[COMPILE_OUT:.*]]:2 = "tf_device.launch"()
+    // CHECK:      %[[COMPILE_OUT:.*]]:2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   %[[COMPILATION_STATUS:.*]], %[[PROGRAM_KEY:.*]] = "tf._TPUCompileMlir"()
     // CHECK-NEXT:   "tf._HostSend"(%[[PROGRAM_KEY]])
     // CHECK-SAME:   recv_device = "/job:localhost/replica:0/task:0/device:CPU:0"
@@ -185,12 +183,10 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr
     // CHECK-SAME:   tensor_name = "compilation_send_recv_key_1
     // CHECK-SAME:   device = "/job:localhost/replica:0/task:0/device:CPU:0"
     // CHECK-NEXT:   tf_device.return %[[COMPILATION_STATUS]], %[[PROGRAM_KEY]]
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    // CHECK-NEXT: "tf_device.launch"()
+    // CHECK:      "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:CPU:0"}>
     // CHECK-NEXT:   "tf.TPUCompileSucceededAssert"(%[[COMPILE_OUT]]#0)
     // CHECK-NEXT:   tf_device.return
-    // CHECK-NEXT: device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    // CHECK-NEXT: %[[ID_TO_ORDINAL:.*]] = "tf.Const"
+    // CHECK:      %[[ID_TO_ORDINAL:.*]] = "tf.Const"
     // CHECK-SAME: value = dense<0>
     // CHECK-NEXT: %[[SIZE_TYPE:.*]] = "tf.Const"
     // CHECK-SAME: value = dense<1>
diff --git a/tensorflow/dtensor/mlir/tests/multi_device_expansion.mlir b/tensorflow/dtensor/mlir/tests/multi_device_expansion.mlir
index 339da803cf4e58..091a9b2d1ef6fd 100644
--- a/tensorflow/dtensor/mlir/tests/multi_device_expansion.mlir
+++ b/tensorflow/dtensor/mlir/tests/multi_device_expansion.mlir
@@ -133,25 +133,25 @@ module @test_inferred_resource_attributes attributes {dtensor.all_reduce_combine
 // CHECK-SAME: %arg0: tensor<1x2xi32> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}
 // CHECK-SAME: %arg1: tensor<1x2xi32> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:1"}
 // CHECK-SAME: -> (tensor<2xi32>, tensor<2xi32>) {
-// CHECK-NEXT:   %0:2 = "tf_device.launch"() ({
+// CHECK-NEXT:   %0:2 = "tf_device.launch"() <{device = ""}> ({
 // CHECK-NEXT:     %compilation_status, %program = "tf._TPUCompileMlir"() <{metadata = ""}> : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
 // CHECK-NEXT:     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
-// CHECK-NEXT:   }) {device = ""} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-// CHECK-NEXT:   "tf_device.launch"() ({
+// CHECK-NEXT:   }) : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+// CHECK-NEXT:   "tf_device.launch"() <{device = ""}> ({
 // CHECK-NEXT:     "tf.TPUCompileSucceededAssert"(%0#0) : (tensor<!tf_type.string>) -> ()
 // CHECK-NEXT:     tf_device.return
-// CHECK-NEXT:   }) {device = ""} : () -> ()
+// CHECK-NEXT:   }) : () -> ()
 // CHECK-NEXT:   %1:2 = "tf_device.parallel_execute"() ({
-// CHECK-NEXT:     %2 = "tf_device.launch"() ({
+// CHECK-NEXT:     %2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}> ({
 // CHECK-NEXT:       %3 = "tf.TPUExecute"(%arg0, %0#1) : (tensor<1x2xi32>, tensor<3x!tf_type.string>) -> tensor<2xi32>
 // CHECK-NEXT:       tf_device.return %3 : tensor<2xi32>
-// CHECK-NEXT:     }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<2xi32>
+// CHECK-NEXT:     }) : () -> tensor<2xi32>
 // CHECK-NEXT:     tf_device.return %2 : tensor<2xi32>
 // CHECK-NEXT:   }, {
-// CHECK-NEXT:     %2 = "tf_device.launch"() ({
+// CHECK-NEXT:     %2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:1"}> ({
 // CHECK-NEXT:       %3 = "tf.TPUExecute"(%arg1, %0#1) : (tensor<1x2xi32>, tensor<3x!tf_type.string>) -> tensor<2xi32>
 // CHECK-NEXT:       tf_device.return %3 : tensor<2xi32>
-// CHECK-NEXT:     }) {device = "/job:localhost/replica:0/task:0/device:TPU:1"} : () -> tensor<2xi32>
+// CHECK-NEXT:     }) : () -> tensor<2xi32>
 // CHECK-NEXT:     tf_device.return %2 : tensor<2xi32>
 // CHECK-NEXT:   }) : () -> (tensor<2xi32>, tensor<2xi32>)
 // CHECK-NEXT:   return %1#0, %1#1 : tensor<2xi32>, tensor<2xi32>
@@ -189,25 +189,25 @@ module attributes {dtensor.all_reduce_combiner.num_ops_in_group = 0 : i64, dtens
 // CHECK-SAME: %arg1: tensor<i32> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:1"}
 // CHECK-SAME: %arg2: tensor<!tf_type.resource<tensor<i32>>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}
 // CHECK-SAME: %arg3: tensor<!tf_type.resource<tensor<i32>>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:1"}
-// CHECK-NEXT:   %0:2 = "tf_device.launch"() ({
+// CHECK-NEXT:   %0:2 = "tf_device.launch"() <{device = ""}> ({
 // CHECK-NEXT:     %compilation_status, %program = "tf._TPUCompileMlir"() <{metadata = ""}> : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
 // CHECK-NEXT:     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
-// CHECK-NEXT:   }) {device = ""} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
-// CHECK-NEXT:   "tf_device.launch"() ({
+// CHECK-NEXT:   }) : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
+// CHECK-NEXT:   "tf_device.launch"() <{device = ""}> ({
 // CHECK-NEXT:     "tf.TPUCompileSucceededAssert"(%0#0) : (tensor<!tf_type.string>) -> ()
 // CHECK-NEXT:     tf_device.return
-// CHECK-NEXT:   }) {device = ""} : () -> ()
+// CHECK-NEXT:   }) : () -> ()
 // CHECK-NEXT:   %1:2 = "tf_device.parallel_execute"() ({
-// CHECK-NEXT:     %2 = "tf_device.launch"() ({
+// CHECK-NEXT:     %2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:0"}> ({
 // CHECK-NEXT:       %3 = "tf.TPUExecute"(%arg0, %0#1) : (tensor<i32>, tensor<3x!tf_type.string>) -> tensor<i32>
 // CHECK-NEXT:       tf_device.return %3 : tensor<i32>
-// CHECK-NEXT:     }) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<i32>
+// CHECK-NEXT:     }) : () -> tensor<i32>
 // CHECK-NEXT:     tf_device.return %2 : tensor<i32>
 // CHECK-NEXT:   }, {
-// CHECK-NEXT:     %2 = "tf_device.launch"() ({
+// CHECK-NEXT:     %2 = "tf_device.launch"() <{device = "/job:localhost/replica:0/task:0/device:TPU:1"}> ({
 // CHECK-NEXT:       %3 = "tf.TPUExecute"(%arg1, %0#1) : (tensor<i32>, tensor<3x!tf_type.string>) -> tensor<i32>
 // CHECK-NEXT:       tf_device.return %3 : tensor<i32>
-// CHECK-NEXT:     }) {device = "/job:localhost/replica:0/task:0/device:TPU:1"} : () -> tensor<i32>
+// CHECK-NEXT:     }) : () -> tensor<i32>
 // CHECK-NEXT:     tf_device.return %2 : tensor<i32>
 // CHECK-NEXT:   }) : () -> (tensor<i32>, tensor<i32>)
 // CHECK-NEXT:   "tf.AssignVariableOp"(%arg2, %1#0) <{validate_shape = false}> {_global_shape = [], _layout = [], device = "/job:localhost/replica:0/task:0/device:TPU:0"} : (tensor<!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
diff --git a/tensorflow/dtensor/mlir/tests/update_tpu_metadata.mlir b/tensorflow/dtensor/mlir/tests/update_tpu_metadata.mlir
index c7632cad45a4d8..98322240757575 100644
--- a/tensorflow/dtensor/mlir/tests/update_tpu_metadata.mlir
+++ b/tensorflow/dtensor/mlir/tests/update_tpu_metadata.mlir
@@ -9,9 +9,9 @@ func.func @main() {
 
 func.func @f_callee() {
   // CHECK:    tf_device.launch
+  // CHECK:    device = ""
   // CHECK:      "tf._TPUCompileMlir"
   // CHECK-SAME:  metadata = "\0A\09\08\01\12\05\12\03\08\80\01\18\04 \01"
-  // CHECK:    device = ""
   %0:2 = "tf_device.launch"() ({
     %1, %2 = "tf._TPUCompileMlir"() {
       NumDynamicShapes = 0 : i64,
@@ -20,7 +20,7 @@ func.func @f_callee() {
     tf_device.return %1, %2 : tensor<!tf_type.string>, tensor<2x!tf_type.string>
   })  {device = "tpu_host:0"} : () -> (tensor<!tf_type.string>, tensor<2x!tf_type.string>)
 
-  // CHECK-NEXT: "tf.TPUExecute"
+  // CHECK: "tf.TPUExecute"
   "tf.TPUExecute"(%0#1) : (tensor<2x!tf_type.string>) -> ()
   func.return
 }
@@ -36,8 +36,8 @@ func.func @main() {
 
 func.func @f_callee() {
   // CHECK:    tf_device.launch
-  // CHECK:      "tf._TPUCompileMlir"
   // CHECK:    device = ""
+  // CHECK:      "tf._TPUCompileMlir"
   %0:2 = "tf_device.launch"() ({
     %1, %2 = "tf._TPUCompileMlir"() {
       NumDynamicShapes = 0 : i64,
@@ -47,8 +47,8 @@ func.func @f_callee() {
   })  {device = "tpu_host:0"} : () -> (tensor<!tf_type.string>, tensor<2x!tf_type.string>)
 
   // CHECK:    tf_device.launch
-  // CHECK:      "tf.TPUExecute"
   // CHECK:    device = ""
+  // CHECK:      "tf.TPUExecute"
   "tf_device.launch"() ({
     "tf.TPUExecute"(%0#1) : (tensor<2x!tf_type.string>) -> ()
     tf_device.return
@@ -83,24 +83,24 @@ func.func @f_callee() {
 
 // -----
 
-// Check for Xla Spmd mesh that TPUCompileOp has correct metadata proto and 
+// Check for Xla Spmd mesh that TPUCompileOp has correct metadata proto and
 // number of program outputs is equal to number of devices on mesh.
 
 // CHECK-LABEL: func @main
 func.func @main(%arg0: tensor<i32>, %arg1: tensor<12x24xf32>) -> (tensor<12x24xf32>) {
     %0 = "tf.StatefulPartitionedCall"(%arg1) {
-      config = "|x=2,y=4|0,1,2,3,4,5,6,7|0,1,2,3,4,5,6,7|/job:localhost/replica:0/task:0/device:TPU:0,/job:localhost/replica:0/task:0/device:TPU:1,/job:localhost/replica:0/task:0/device:TPU:2,/job:localhost/replica:0/task:0/device:TPU:3,/job:localhost/replica:0/task:0/device:TPU:4,/job:localhost/replica:0/task:0/device:TPU:5,/job:localhost/replica:0/task:0/device:TPU:6,/job:localhost/replica:0/task:0/device:TPU:7|use_xla_spmd", 
-      config_proto = "", 
-      executor_type = "", 
+      config = "|x=2,y=4|0,1,2,3,4,5,6,7|0,1,2,3,4,5,6,7|/job:localhost/replica:0/task:0/device:TPU:0,/job:localhost/replica:0/task:0/device:TPU:1,/job:localhost/replica:0/task:0/device:TPU:2,/job:localhost/replica:0/task:0/device:TPU:3,/job:localhost/replica:0/task:0/device:TPU:4,/job:localhost/replica:0/task:0/device:TPU:5,/job:localhost/replica:0/task:0/device:TPU:6,/job:localhost/replica:0/task:0/device:TPU:7|use_xla_spmd",
+      config_proto = "",
+      executor_type = "",
       f = @_xla_spmd_func} : (tensor<12x24xf32>) -> tensor<12x24xf32>
     return %0 : tensor<12x24xf32>
   }
 
 func.func private @_xla_spmd_func(%arg0: tensor<12x24xf32>) -> tensor<12x24xf32> {
   // CHECK:    tf_device.launch
+  // CHECK:    device = ""
   // CHECK:      %compilation_status, %program:8 = "tf._TPUCompileMlir"
   // CHECK-SAME:  metadata = "\0A\10\08\01\12\08\12\02\08\0C\12\02\08\18\18\01\22\00\12\02\0A\00\18\01 \08x\01\88\01\ED\91\DC\F5\C3\8C\95\B5\90\01"
-  // CHECK:    device = ""
   %0:2 = "tf_device.launch"() ({
     %compilation_status, %program = "tf._TPUCompileMlir"() {metadata = "\0A\18\08\01\12\08\12\02\08\0C\12\02\08\18\18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\01 \01\88\01\ED\91\DC\F5\C3\8C\95\B5\90\01", mlir_module = "#loc = loc(unknown)\0Amodule attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1345 : i32}} {\0A  func.func @main(%arg0: tensor<12x24xf32> {mhlo.sharding = \22\22} loc(unknown)) -> (tensor<12x24xf32> {mhlo.sharding = \22\22}) {\0A    %0 = \22tf.Identity\22(%arg0) : (tensor<12x24xf32>) -> tensor<12x24xf32> loc(#loc)\0A    return %0 : tensor<12x24xf32> loc(#loc)\0A  } loc(#loc)\0A} loc(#loc)\0A"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
     tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>