diff --git a/configs/glip/README.md b/configs/glip/README.md index 6d8a3c22983..5c3015ec011 100644 --- a/configs/glip/README.md +++ b/configs/glip/README.md @@ -99,3 +99,74 @@ Note: 1. The above are zero-shot evaluation results. 2. The evaluation metric we used is LVIS FixAP. For specific details, please refer to [Evaluating Large-Vocabulary Object Detectors: The Devil is in the Details](https://arxiv.org/pdf/2102.01066.pdf). 3. We found that the performance on small models is better than the official results, but it is lower on large models. This is mainly due to the incomplete alignment of the GLIP post-processing. + +## ODinW (Object Detection in the Wild) Results + +Learning visual representations from natural language supervision has recently shown great promise in a number of pioneering works. In general, these language-augmented visual models demonstrate strong transferability to a variety of datasets and tasks. However, it remains challenging to evaluate the transferablity of these models due to the lack of easy-to-use evaluation toolkits and public benchmarks. To tackle this, we build ELEVATER 1 , the first benchmark and toolkit for evaluating (pre-trained) language-augmented visual models. ELEVATER is composed of three components. (i) Datasets. As downstream evaluation suites, it consists of 20 image classification datasets and 35 object detection datasets, each of which is augmented with external knowledge. (ii) Toolkit. An automatic hyper-parameter tuning toolkit is developed to facilitate model evaluation on downstream tasks. (iii) Metrics. A variety of evaluation metrics are used to measure sample-efficiency (zero-shot and few-shot) and parameter-efficiency (linear probing and full model fine-tuning). ELEVATER is platform for Computer Vision in the Wild (CVinW), and is publicly released at https://computer-vision-in-the-wild.github.io/ELEVATER/ + +### Results and models of ODinW13 + +| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | +| --------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | +| AerialMaritimeDrone | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | +| Aquarium | 0.175 | 0.174 | 0.173 | 0.169 | 0.191 | 0.190 | 0.195 | 0.445 | +| CottontailRabbits | 0.686 | 0.686 | 0.688 | 0.688 | 0.744 | 0.744 | 0.799 | 0.808 | +| EgoHands | 0.013 | 0.013 | 0.003 | 0.004 | 0.314 | 0.315 | 0.608 | 0.764 | +| NorthAmericaMushrooms | 0.502 | 0.502 | 0.367 | 0.367 | 0.297 | 0.296 | 0.507 | 0.675 | +| Packages | 0.589 | 0.589 | 0.083 | 0.083 | 0.699 | 0.699 | 0.687 | 0.670 | +| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | +| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | +| pothole | 0.007 | 0.007 | 0.030 | 0.030 | 0.058 | 0.058 | 0.215 | 0.478 | +| Raccoon | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | +| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | +| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | +| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | +| Average | **0.325** | **0.324** | **0.320** | **0.318** | **0.392** | **0.392** | **0.514** | **0.621** | + +### Results and models of ODinW35 + +| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | +| 
--------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | +| AerialMaritimeDrone_large | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | +| AerialMaritimeDrone_tiled | 0.174 | 0.174 | 0.172 | 0.172 | 0.172 | 0.172 | 0.206 | 0.364 | +| AmericanSignLanguageLetters | 0.001 | 0.001 | 0.003 | 0.003 | 0.009 | 0.009 | 0.002 | 0.096 | +| Aquarium | 0.175 | 0.175 | 0.173 | 0.171 | 0.192 | 0.182 | 0.195 | 0.445 | +| BCCD | 0.016 | 0.016 | 0.001 | 0.001 | 0.000 | 0.000 | 0.161 | 0.584 | +| boggleBoards | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.134 | +| brackishUnderwater | 0.016 | 0..013 | 0.021 | 0.027 | 0.020 | 0.022 | 0.021 | 0.454 | +| ChessPieces | 0.001 | 0.001 | 0.000 | 0.000 | 0.001 | 0.001 | 0.000 | 0.000 | +| CottontailRabbits | 0.710 | 0.709 | 0.683 | 0.683 | 0.752 | 0.752 | 0.806 | 0.797 | +| dice | 0.005 | 0.005 | 0.004 | 0.004 | 0.004 | 0.004 | 0.004 | 0.082 | +| DroneControl | 0.016 | 0.017 | 0.006 | 0.008 | 0.005 | 0.007 | 0.042 | 0.638 | +| EgoHands_generic | 0.009 | 0.010 | 0.005 | 0.006 | 0.510 | 0.508 | 0.608 | 0.764 | +| EgoHands_specific | 0.001 | 0.001 | 0.004 | 0.006 | 0.003 | 0.004 | 0.002 | 0.687 | +| HardHatWorkers | 0.029 | 0.029 | 0.023 | 0.023 | 0.033 | 0.033 | 0.046 | 0.439 | +| MaskWearing | 0.007 | 0.007 | 0.003 | 0.002 | 0.005 | 0.005 | 0.004 | 0.406 | +| MountainDewCommercial | 0.218 | 0.227 | 0.199 | 0.197 | 0.478 | 0.463 | 0.430 | 0.580 | +| NorthAmericaMushrooms | 0.502 | 0.502 | 0.450 | 0.450 | 0.497 | 0.497 | 0.471 | 0.501 | +| openPoetryVision | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.051 | +| OxfordPets_by_breed | 0.001 | 0.002 | 0.002 | 0.004 | 0.001 | 0.002 | 0.003 | 0.799 | +| OxfordPets_by_species | 0.016 | 0.011 | 0.012 | 0.009 | 0.013 | 0.009 | 0.011 | 0.872 | +| PKLot | 0.002 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.774 | +| Packages | 0.569 | 0.569 | 0.279 | 0.279 | 0.712 | 0.712 | 0.695 | 0.728 | +| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | +| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | +| plantdoc | 0.002 | 0.002 | 0.007 | 0.007 | 0.009 | 0.009 | 0.005 | 0.376 | +| pothole | 0.007 | 0.010 | 0.024 | 0.025 | 0.085 | 0.101 | 0.215 | 0.478 | +| Raccoons | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | +| selfdrivingCar | 0.071 | 0.072 | 0.074 | 0.074 | 0.081 | 0.080 | 0.089 | 0.318 | +| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | +| ThermalCheetah | 0.028 | 0.028 | 0.000 | 0.000 | 0.028 | 0.028 | 0.087 | 0.290 | +| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | +| UnoCards | 0.000 | 0.000 | 0.000 | 0.001 | 0.002 | 0.003 | 0.006 | 0.754 | +| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | +| WildfireSmoke | 0.000 | 0.000 | 0.000 | 0.000 | 0.017 | 0.017 | 0.134 | 0.410 | +| websiteScreenshots | 0.003 | 0.004 | 0.003 | 0.005 | 0.005 | 0.006 | 0.012 | 0.175 | +| Average | **0.134** | **0.134** | **0.138** | **0.138** | **0.179** | **0.178** | **0.227** | **0.492** | + +### Results on Flickr30k + +| Model | Official | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 | +| ------------- | -------- | -------------- | ------- | ------- | -------- | -------- | -------- | --------- | +| **GLIP-T(C)** | ✔ | O365, GoldG | 84.8 | 94.9 | 96.3 | 85.5 | 95.4 | 96.6 | +| **GLIP-T(C)** | | O365, GoldG | 
84.9 | 94.9 | 96.3 | 85.6 | 95.4 | 96.7 | diff --git a/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py b/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py new file mode 100644 index 00000000000..c494bfcdec5 --- /dev/null +++ b/configs/glip/flickr30k/glip_atss_swin-t_c_fpn_dyhead_pretrain_obj365-goldg_zeroshot_flickr30k.py @@ -0,0 +1,61 @@ +_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' + +lang_model_name = 'bert-base-uncased' + +model = dict(bbox_head=dict(early_fuse=True), ) + +dataset_type = 'Flickr30kDataset' +data_root = 'data/flickr30k/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive', 'phrase_ids', 'phrases')) +] + +dataset_Flickr30k_val = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_val.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +dataset_Flickr30k_test = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_test.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +val_evaluator_Flickr30k = dict(type='Flickr30kMetric', ) + +test_evaluator_Flickr30k = dict(type='Flickr30kMetric', ) + +# ----------Config---------- # +dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest'] +datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test] +metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py b/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py similarity index 99% rename from configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py rename to configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py index 6c2cc0c6f09..d38effba8c1 100644 --- a/configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py +++ b/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw13.py @@ -1,4 +1,4 @@ -_base_ = '../glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' +_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' dataset_type = 'CocoDataset' data_root = 'data/odinw/' diff --git a/configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py b/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py similarity index 99% rename from configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py rename to configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py index 5e640d901bd..2eaf09ed771 100644 --- a/configs/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py +++ b/configs/glip/odinw/glip_atss_swin-t_a_fpn_dyhead_pretrain_odinw35.py @@ -1,4 +1,4 @@ -_base_ = '../glip/glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' +_base_ = '../glip_atss_swin-t_a_fpn_dyhead_pretrain_obj365.py' dataset_type = 'CocoDataset' 
data_root = 'data/odinw/' @@ -518,7 +518,7 @@ caption_prompt = { 'pothole': { 'name': 'holes', - 'prefix': 'there are some', + 'prefix': 'there are some ', 'suffix': ' on the road' } } diff --git a/configs/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py b/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py similarity index 100% rename from configs/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py rename to configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw13.py diff --git a/configs/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py b/configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py similarity index 100% rename from configs/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py rename to configs/glip/odinw/glip_atss_swin-t_bc_fpn_dyhead_pretrain_odinw35.py diff --git a/configs/odinw/override_category.py b/configs/glip/odinw/override_category.py similarity index 100% rename from configs/odinw/override_category.py rename to configs/glip/odinw/override_category.py diff --git a/configs/grounding_dino/README.md b/configs/grounding_dino/README.md index 715b630cc79..2a527828a46 100644 --- a/configs/grounding_dino/README.md +++ b/configs/grounding_dino/README.md @@ -59,7 +59,7 @@ python demo/image_demo.py \ -## Results and Models +## COCO Results and Models | Model | Backbone | Style | COCO mAP | Official COCO mAP | Pre-Train Data | Config | Download | | :----------------: | :------: | :-------: | :--------: | :---------------: | :----------------------------------------------: | :------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | @@ -75,6 +75,151 @@ Note: 2. Finetune refers to fine-tuning on the COCO 2017 dataset. The R50 model is trained using 8 NVIDIA GeForce 3090 GPUs, while the remaining models are trained using 16 NVIDIA GeForce 3090 GPUs. The GPU memory usage is approximately 8.5GB. 3. Our performance is higher than the official model due to two reasons: we modified the initialization strategy and introduced a log scaler. 
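+A side note on the ODinW `caption_prompt` tweak earlier in this patch (the pothole prefix gains a trailing space): the prefix, name and suffix appear to be concatenated as-is into the text prompt, which is what the whitespace fix suggests. Below is a minimal sketch of that assumption; the `build_caption` helper is hypothetical and not part of MMDetection.
+
+```python
+# Hypothetical illustration (not MMDetection code): assume the three prompt
+# pieces are joined verbatim to form the text prompt for a dataset.
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',  # trailing space added in this patch
+        'suffix': ' on the road',
+    }
+}
+
+def build_caption(entry):
+    # Join prefix + name + suffix without inserting any extra whitespace.
+    return entry['prefix'] + entry['name'] + entry['suffix']
+
+print(build_caption(caption_prompt['pothole']))
+# -> 'there are some holes on the road'
+# Without the trailing space the prompt would read 'there are someholes on the road'.
+```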
+## LVIS Results
+
+| Model | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP | Val1.0 APr | Val1.0 APc | Val1.0 APf | Val1.0 AP | Pre-Train Data | Config | Download |
+| :--------------: | :---------: | :---------: | :---------: | :--------: | :--------: | :--------: | :--------: | :-------: | :----------------------------------------------: | :------------------------------------------------------------------: | :------------------: |
+| Grounding DINO-T | 18.8 | 24.2 | 34.7 | 28.8 | 10.1 | 15.3 | 29.9 | 20.1 | O365,GoldG,Cap4M | [config](lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) |
+| Grounding DINO-B | 27.9 | 33.4 | 37.2 | 34.7 | 19.0 | 24.1 | 32.9 | 26.7 | COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO | [config](lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth) |
+
+Note:
+
+1. The above are zero-shot evaluation results.
+2. The evaluation metric we used is LVIS FixAP. For specific details, please refer to [Evaluating Large-Vocabulary Object Detectors: The Devil is in the Details](https://arxiv.org/pdf/2102.01066.pdf).
+
+## ODinW (Object Detection in the Wild) Results
+
+Learning visual representations from natural language supervision has recently shown great promise in a number of pioneering works. In general, these language-augmented visual models demonstrate strong transferability to a variety of datasets and tasks. However, it remains challenging to evaluate the transferability of these models due to the lack of easy-to-use evaluation toolkits and public benchmarks. To tackle this, we build ELEVATER, the first benchmark and toolkit for evaluating (pre-trained) language-augmented visual models. ELEVATER is composed of three components. (i) Datasets. As downstream evaluation suites, it consists of 20 image classification datasets and 35 object detection datasets, each of which is augmented with external knowledge. (ii) Toolkit. An automatic hyper-parameter tuning toolkit is developed to facilitate model evaluation on downstream tasks. (iii) Metrics. A variety of evaluation metrics are used to measure sample-efficiency (zero-shot and few-shot) and parameter-efficiency (linear probing and full model fine-tuning).
ELEVATER is platform for Computer Vision in the Wild (CVinW), and is publicly released at https://computer-vision-in-the-wild.github.io/ELEVATER/ + +### Results and models of ODinW13 + +| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | +| --------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | +| AerialMaritimeDrone | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | +| Aquarium | 0.175 | 0.174 | 0.173 | 0.169 | 0.191 | 0.190 | 0.195 | 0.445 | +| CottontailRabbits | 0.686 | 0.686 | 0.688 | 0.688 | 0.744 | 0.744 | 0.799 | 0.808 | +| EgoHands | 0.013 | 0.013 | 0.003 | 0.004 | 0.314 | 0.315 | 0.608 | 0.764 | +| NorthAmericaMushrooms | 0.502 | 0.502 | 0.367 | 0.367 | 0.297 | 0.296 | 0.507 | 0.675 | +| Packages | 0.589 | 0.589 | 0.083 | 0.083 | 0.699 | 0.699 | 0.687 | 0.670 | +| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | +| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | +| pothole | 0.007 | 0.007 | 0.030 | 0.030 | 0.058 | 0.058 | 0.215 | 0.478 | +| Raccoon | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | +| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | +| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | +| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | +| Average | **0.325** | **0.324** | **0.320** | **0.318** | **0.392** | **0.392** | **0.514** | **0.621** | + +### Results and models of ODinW35 + +| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | +| --------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | +| AerialMaritimeDrone_large | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | +| AerialMaritimeDrone_tiled | 0.174 | 0.174 | 0.172 | 0.172 | 0.172 | 0.172 | 0.206 | 0.364 | +| AmericanSignLanguageLetters | 0.001 | 0.001 | 0.003 | 0.003 | 0.009 | 0.009 | 0.002 | 0.096 | +| Aquarium | 0.175 | 0.175 | 0.173 | 0.171 | 0.192 | 0.182 | 0.195 | 0.445 | +| BCCD | 0.016 | 0.016 | 0.001 | 0.001 | 0.000 | 0.000 | 0.161 | 0.584 | +| boggleBoards | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.134 | +| brackishUnderwater | 0.016 | 0..013 | 0.021 | 0.027 | 0.020 | 0.022 | 0.021 | 0.454 | +| ChessPieces | 0.001 | 0.001 | 0.000 | 0.000 | 0.001 | 0.001 | 0.000 | 0.000 | +| CottontailRabbits | 0.710 | 0.709 | 0.683 | 0.683 | 0.752 | 0.752 | 0.806 | 0.797 | +| dice | 0.005 | 0.005 | 0.004 | 0.004 | 0.004 | 0.004 | 0.004 | 0.082 | +| DroneControl | 0.016 | 0.017 | 0.006 | 0.008 | 0.005 | 0.007 | 0.042 | 0.638 | +| EgoHands_generic | 0.009 | 0.010 | 0.005 | 0.006 | 0.510 | 0.508 | 0.608 | 0.764 | +| EgoHands_specific | 0.001 | 0.001 | 0.004 | 0.006 | 0.003 | 0.004 | 0.002 | 0.687 | +| HardHatWorkers | 0.029 | 0.029 | 0.023 | 0.023 | 0.033 | 0.033 | 0.046 | 0.439 | +| MaskWearing | 0.007 | 0.007 | 0.003 | 0.002 | 0.005 | 0.005 | 0.004 | 0.406 | +| MountainDewCommercial | 0.218 | 0.227 | 0.199 | 0.197 | 0.478 | 0.463 | 0.430 | 0.580 | +| NorthAmericaMushrooms | 0.502 | 0.502 | 0.450 | 0.450 | 0.497 | 0.497 | 0.471 | 0.501 | +| openPoetryVision | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.051 | +| OxfordPets_by_breed | 0.001 | 0.002 | 0.002 | 0.004 | 0.001 | 0.002 | 0.003 | 0.799 | +| 
OxfordPets_by_species | 0.016 | 0.011 | 0.012 | 0.009 | 0.013 | 0.009 | 0.011 | 0.872 | +| PKLot | 0.002 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.774 | +| Packages | 0.569 | 0.569 | 0.279 | 0.279 | 0.712 | 0.712 | 0.695 | 0.728 | +| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | +| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | +| plantdoc | 0.002 | 0.002 | 0.007 | 0.007 | 0.009 | 0.009 | 0.005 | 0.376 | +| pothole | 0.007 | 0.010 | 0.024 | 0.025 | 0.085 | 0.101 | 0.215 | 0.478 | +| Raccoons | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | +| selfdrivingCar | 0.071 | 0.072 | 0.074 | 0.074 | 0.081 | 0.080 | 0.089 | 0.318 | +| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | +| ThermalCheetah | 0.028 | 0.028 | 0.000 | 0.000 | 0.028 | 0.028 | 0.087 | 0.290 | +| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | +| UnoCards | 0.000 | 0.000 | 0.000 | 0.001 | 0.002 | 0.003 | 0.006 | 0.754 | +| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | +| WildfireSmoke | 0.000 | 0.000 | 0.000 | 0.000 | 0.017 | 0.017 | 0.134 | 0.410 | +| websiteScreenshots | 0.003 | 0.004 | 0.003 | 0.005 | 0.005 | 0.006 | 0.012 | 0.175 | +| Average | **0.134** | **0.134** | **0.138** | **0.138** | **0.179** | **0.178** | **0.227** | **0.492** | + +## Flickr30k Results + +| Model | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Tesst R@1 | Test R@5 | Test R@10 | Config | Download | +| :--------------: | :--------------: | ------- | ------- | -------- | --------- | -------- | --------- | :-------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Grounding DINO-T | O365,GoldG,Cap4M | 87.8 | 96.6 | 98.0 | 88.1 | 96.9 | 98.2 | [config](grounding_dino_swin-t_finetune_16xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544.log.json) | + +Note: + +1. `@1,5,10` refers to precision at the top 1, 5, and 10 positions in a predicted ranked list. +2. The pretraining data used by Grounding DINO-T is `O365,GoldG,Cap4M`, and the corresponding evaluation configuration is (grounding_dino_swin-t_pretrain_zeroshot_refcoco)\[refcoco/grounding_dino_swin-t_pretrain_zeroshot_refcoco.py\]. + +Test Command + +```shell +cd mmdetection +bash tools/dist_test.sh configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py checkpoints/groundingdino_swint_ogc_mmdet-822d7e9d.pth 8 +``` + +## Referring Expression Comprehension Results + +| Method | Grounding DINO-T
(O365,GoldG,Cap4M) | Grounding DINO-B
(COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO) |
+| --------------------------------------- | ------------------------------------- | ----------------------------------------------------------------- |
+| RefCOCO val @1,5,10 | 50.77/89.45/94.86 | 84.61/97.88/99.10 |
+| RefCOCO testA @1,5,10 | 57.45/91.29/95.62 | 88.65/98.89/99.63 |
+| RefCOCO testB @1,5,10 | 44.97/86.54/92.88 | 80.51/96.64/98.51 |
+| RefCOCO+ val @1,5,10 | 51.64/86.35/92.57 | 73.67/96.60/98.65 |
+| RefCOCO+ testA @1,5,10 | 57.25/86.74/92.65 | 82.19/97.92/99.09 |
+| RefCOCO+ testB @1,5,10 | 46.35/84.05/90.67 | 64.10/94.25/97.46 |
+| RefCOCOg val @1,5,10 | 60.42/92.10/96.18 | 78.33/97.28/98.57 |
+| RefCOCOg test @1,5,10 | 59.74/92.08/96.28 | 78.11/97.06/98.65 |
+| gRefCOCO val Pr@(F1=1, IoU≥0.5),N-acc | 41.32/91.82 | 46.18/81.44 |
+| gRefCOCO testA Pr@(F1=1, IoU≥0.5),N-acc | 27.23/90.24 | 38.60/76.06 |
+| gRefCOCO testB Pr@(F1=1, IoU≥0.5),N-acc | 29.70/93.49 | 35.87/80.58 |
+
+Note:
+
+1. `@1,5,10` refers to precision at the top 1, 5, and 10 positions in a predicted ranked list.
+2. `Pr@(F1=1, IoU≥0.5),N-acc` is from the paper [GREC: Generalized Referring Expression Comprehension](https://arxiv.org/pdf/2308.16182.pdf).
+3. The pretraining data used by Grounding DINO-T is `O365,GoldG,Cap4M`, and the corresponding evaluation configuration is [grounding_dino_swin-t_pretrain_zeroshot_refexp](refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py).
+4. The pretraining data used by Grounding DINO-B is `COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO`, and the corresponding evaluation configuration is [grounding_dino_swin-b_pretrain_zeroshot_refexp](refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py).
+
+Test Command
+
+```shell
+cd mmdetection
+./tools/dist_test.sh configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth 8
+./tools/dist_test.sh configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth 8
+```
+
+## Description Detection Dataset
+
+```shell
+pip install ddd-dataset
+```
+
+| Method | mode | Grounding DINO-T<br/>
(O365,GoldG,Cap4M) | Grounding DINO-B
(COCO,O365,GoldG,Cap4M,OpenImage,ODinW-35,RefCOCO) | +| -------------------------------- | -------- | ----------------------------------------- | ------------------------------------------------------------------------- | +| FULL/short/middle/long/very long | concat | 17.2/18.0/18.7/14.8/16.3 | 20.2/20.4/21.1/18.8/19.8 | +| FULL/short/middle/long/very long | parallel | 22.3/28.2/24.8/19.1/13.9 | 25.0/26.4/27.2/23.5/19.7 | +| PRES/short/middle/long/very long | concat | 17.8/18.3/19.2/15.2/17.3 | 20.7/21.7/21.4/19.1/20.3 | +| PRES/short/middle/long/very long | parallel | 21.0/27.0/22.8/17.5/12.5 | 23.7/25.8/25.1/21.9/19.3 | +| ABS/short/middle/long/very long | concat | 15.4/17.1/16.4/13.6/14.9 | 18.6/16.1/19.7/18.1/19.1 | +| ABS/short/middle/long/very long | parallel | 26.0/32.0/33.0/23.6/15.5 | 28.8/28.1/35.8/28.2/20.2 | + +Note: + +1. Considering that the evaluation time for Inter-scenario is very long and the performance is low, it is temporarily not supported. The mentioned metrics are for Intra-scenario. +2. `concat` is the default inference mode for Grounding DINO, where it concatenates multiple sub-sentences with "." to form a single sentence for inference. On the other hand, "parallel" performs inference on each sub-sentence in a for-loop. + ## Custom Dataset To facilitate fine-tuning on custom datasets, we use a simple cat dataset as an example, as shown in the following steps. diff --git a/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py new file mode 100644 index 00000000000..ac655b74aa6 --- /dev/null +++ b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py @@ -0,0 +1,14 @@ +_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py' + +model = dict( + type='GroundingDINO', + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py new file mode 100644 index 00000000000..9a1c8f2ac74 --- /dev/null +++ b/configs/grounding_dino/dod/grounding_dino_swin-b_pretrain_zeroshot_parallel_dod.py @@ -0,0 +1,3 @@ +_base_ = 'grounding_dino_swin-b_pretrain_zeroshot_concat_dod.py' + +model = dict(test_cfg=dict(chunked_size=1)) diff --git a/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py new file mode 100644 index 00000000000..bb418011bf4 --- /dev/null +++ b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py @@ -0,0 +1,78 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' + +data_root = 'data/d3/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', 'sent_ids')) +] + +# -------------------------------------------------# +val_dataset_full = dict( + type='DODDataset', + data_root=data_root, + 
ann_file='d3_json/d3_full_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) + +val_evaluator_full = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_full_annotations.json') + +# -------------------------------------------------# +val_dataset_pres = dict( + type='DODDataset', + data_root=data_root, + ann_file='d3_json/d3_pres_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) +val_evaluator_pres = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_pres_annotations.json') + +# -------------------------------------------------# +val_dataset_abs = dict( + type='DODDataset', + data_root=data_root, + ann_file='d3_json/d3_abs_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) +val_evaluator_abs = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_abs_annotations.json') + +# -------------------------------------------------# +datasets = [val_dataset_full, val_dataset_pres, val_dataset_abs] +dataset_prefixes = ['FULL', 'PRES', 'ABS'] +metrics = [val_evaluator_full, val_evaluator_pres, val_evaluator_abs] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py new file mode 100644 index 00000000000..3d680091162 --- /dev/null +++ b/configs/grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py @@ -0,0 +1,3 @@ +_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py' + +model = dict(test_cfg=dict(chunked_size=1)) diff --git a/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py b/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py new file mode 100644 index 00000000000..e2df152fef4 --- /dev/null +++ b/configs/grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py @@ -0,0 +1,57 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' + +dataset_type = 'Flickr30kDataset' +data_root = 'data/flickr30k/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive', 'phrase_ids', 'phrases')) +] + +dataset_Flickr30k_val = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_val.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +dataset_Flickr30k_test = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_test.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +val_evaluator_Flickr30k = 
dict(type='Flickr30kMetric') + +test_evaluator_Flickr30k = dict(type='Flickr30kMetric') + +# ----------Config---------- # +dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest'] +datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test] +metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py b/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py index 1117cb06d39..7448764ef7e 100644 --- a/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py +++ b/configs/grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py @@ -119,7 +119,8 @@ dict( type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', - 'scale_factor', 'text', 'custom_entities')) + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive')) ] val_dataloader = dict( diff --git a/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py new file mode 100644 index 00000000000..6084159044e --- /dev/null +++ b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_lvis.py @@ -0,0 +1,14 @@ +_base_ = './grounding_dino_swin-t_pretrain_zeroshot_lvis.py' + +model = dict( + type='GroundingDINO', + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py new file mode 100644 index 00000000000..68467a7237c --- /dev/null +++ b/configs/grounding_dino/lvis/grounding_dino_swin-b_pretrain_zeroshot_mini-lvis.py @@ -0,0 +1,14 @@ +_base_ = './grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py' + +model = dict( + type='GroundingDINO', + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py new file mode 100644 index 00000000000..3d05f0ce1c0 --- /dev/null +++ b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py @@ -0,0 +1,24 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' + +model = dict(test_cfg=dict( + max_per_img=300, + chunked_size=40, +)) + +dataset_type = 'LVISV1Dataset' +data_root = 'data/coco/' + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + type=dataset_type, + ann_file='annotations/lvis_od_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +# numpy < 1.24.0 +val_evaluator = dict( + _delete_=True, + type='LVISFixedAPMetric', + ann_file=data_root + 'annotations/lvis_od_val.json') +test_evaluator = val_evaluator diff --git a/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py 
b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py new file mode 100644 index 00000000000..0aac6cf33a9 --- /dev/null +++ b/configs/grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py @@ -0,0 +1,25 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' + +model = dict(test_cfg=dict( + max_per_img=300, + chunked_size=40, +)) + +dataset_type = 'LVISV1Dataset' +data_root = 'data/coco/' + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + type=dataset_type, + ann_file='annotations/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +# numpy < 1.24.0 +val_evaluator = dict( + _delete_=True, + type='LVISFixedAPMetric', + ann_file=data_root + + 'annotations/lvis_v1_minival_inserted_image_name.json') +test_evaluator = val_evaluator diff --git a/configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py similarity index 99% rename from configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py rename to configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py index b853d23fafe..65a6bc2a078 100644 --- a/configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py +++ b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw13.py @@ -1,4 +1,4 @@ -_base_ = '../grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py' +_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py' dataset_type = 'CocoDataset' data_root = 'data/odinw/' diff --git a/configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py similarity index 99% rename from configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py rename to configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py index a4b546b5998..e73cd8e61ba 100644 --- a/configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py +++ b/configs/grounding_dino/odinw/grounding_dino_swin-b_pretrain_odinw35.py @@ -1,4 +1,4 @@ -_base_ = '../grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py' +_base_ = '../grounding_dino_swin-b_pretrain_mixeddata.py' dataset_type = 'CocoDataset' data_root = 'data/odinw/' diff --git a/configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py similarity index 99% rename from configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py rename to configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py index 6421ffc24ab..216b8059726 100644 --- a/configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py +++ b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py @@ -1,4 +1,4 @@ -_base_ = '../grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' # noqa +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' # noqa dataset_type = 'CocoDataset' data_root = 'data/odinw/' diff --git a/configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py similarity index 99% rename from configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py rename to configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py index 78a3d8626c0..3df0394a204 100644 --- a/configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py +++ b/configs/grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py @@ -1,4 +1,4 @@ -_base_ = 
'../grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' # noqa +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' # noqa dataset_type = 'CocoDataset' data_root = 'data/odinw/' @@ -519,7 +519,7 @@ caption_prompt = { 'pothole': { 'name': 'holes', - 'prefix': 'there are some', + 'prefix': 'there are some ', 'suffix': ' on the road' } } diff --git a/configs/grounding_dino/odinw/override_category.py b/configs/grounding_dino/odinw/override_category.py new file mode 100644 index 00000000000..9ff05fc6e5e --- /dev/null +++ b/configs/grounding_dino/odinw/override_category.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import mmengine + + +def parse_args(): + parser = argparse.ArgumentParser(description='Override Category') + parser.add_argument('data_root') + return parser.parse_args() + + +def main(): + args = parse_args() + + ChessPieces = [{ + 'id': 1, + 'name': ' ', + 'supercategory': 'pieces' + }, { + 'id': 2, + 'name': 'black bishop', + 'supercategory': 'pieces' + }, { + 'id': 3, + 'name': 'black king', + 'supercategory': 'pieces' + }, { + 'id': 4, + 'name': 'black knight', + 'supercategory': 'pieces' + }, { + 'id': 5, + 'name': 'black pawn', + 'supercategory': 'pieces' + }, { + 'id': 6, + 'name': 'black queen', + 'supercategory': 'pieces' + }, { + 'id': 7, + 'name': 'black rook', + 'supercategory': 'pieces' + }, { + 'id': 8, + 'name': 'white bishop', + 'supercategory': 'pieces' + }, { + 'id': 9, + 'name': 'white king', + 'supercategory': 'pieces' + }, { + 'id': 10, + 'name': 'white knight', + 'supercategory': 'pieces' + }, { + 'id': 11, + 'name': 'white pawn', + 'supercategory': 'pieces' + }, { + 'id': 12, + 'name': 'white queen', + 'supercategory': 'pieces' + }, { + 'id': 13, + 'name': 'white rook', + 'supercategory': 'pieces' + }] + + _data_root = args.data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/' + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = ChessPieces + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + CottontailRabbits = [{ + 'id': 1, + 'name': 'rabbit', + 'supercategory': 'Cottontail-Rabbit' + }] + + _data_root = args.data_root + 'CottontailRabbits/' + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = CottontailRabbits + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + NorthAmericaMushrooms = [{ + 'id': 1, + 'name': 'flat mushroom', + 'supercategory': 'mushroom' + }, { + 'id': 2, + 'name': 'yellow mushroom', + 'supercategory': 'mushroom' + }] + + _data_root = args.data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/' # noqa + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = NorthAmericaMushrooms + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + +if __name__ == '__main__': + main() diff --git a/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py b/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py new file mode 100644 index 00000000000..dea0bad08c0 --- /dev/null +++ b/configs/grounding_dino/refcoco/grounding_dino_swin-b_pretrain_zeroshot_refexp.py @@ -0,0 +1,14 @@ +_base_ = './grounding_dino_swin-t_pretrain_zeroshot_refexp.py' + +model = dict( + type='GroundingDINO', + backbone=dict( + 
pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.3, + patch_norm=True), + neck=dict(in_channels=[256, 512, 1024]), +) diff --git a/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py b/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py new file mode 100644 index 00000000000..4b5c46574a3 --- /dev/null +++ b/configs/grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py @@ -0,0 +1,228 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py' + +# 30 is an empirical value, just set it to the maximum value +# without affecting the evaluation result +model = dict(test_cfg=dict(max_per_img=30)) + +data_root = 'data/coco/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive')) +] + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/final_refexp_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_testA.json' +val_dataset_refcoco_plus_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_plus_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json' +val_dataset_refcoco_plus_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + 
backend_args=None) + +val_evaluator_refcoco_plus_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcocog_test.json' +val_dataset_refcocog_test = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcocog_test = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_val.json' +val_dataset_grefcoco_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_val = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json' +val_dataset_grefcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_testA = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json' +val_dataset_grefcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_testB = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB, + val_dataset_refcoco_plus_testA, val_dataset_refcoco_plus_testB, + val_dataset_refcocog_test, val_dataset_grefcoco_val, + val_dataset_grefcoco_testA, val_dataset_grefcoco_testB +] +dataset_prefixes = [ + 'val', 'refcoco_testA', 'refcoco_testB', 'refcoco+_testA', + 'refcoco+_testB', 'refcocog_test', 'grefcoco_val', 'grefcoco_testA', + 'grefcoco_testB' +] +metrics = [ + val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB, val_evaluator_refcoco_plus_testA, + val_evaluator_refcoco_plus_testB, val_evaluator_refcocog_test, + val_evaluator_grefcoco_val, val_evaluator_grefcoco_testA, + val_evaluator_grefcoco_testB +] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/README.md b/configs/mm_grounding_dino/README.md new file mode 100644 index 00000000000..346dd97cd51 --- /dev/null +++ b/configs/mm_grounding_dino/README.md @@ -0,0 +1,147 @@ +# 
Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection + +[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) + + + +## Abstract + +In this paper, we present an open-set object detector, called Grounding DINO, by marrying Transformer-based detector DINO with grounded pre-training, which can detect arbitrary objects with human inputs such as category names or referring expressions. The key solution of open-set object detection is introducing language to a closed-set detector for open-set concept generalization. To effectively fuse language and vision modalities, we conceptually divide a closed-set detector into three phases and propose a tight fusion solution, which includes a feature enhancer, a language-guided query selection, and a cross-modality decoder for cross-modality fusion. While previous works mainly evaluate open-set object detection on novel categories, we propose to also perform evaluations on referring expression comprehension for objects specified with attributes. Grounding DINO performs remarkably well on all three settings, including benchmarks on COCO, LVIS, ODinW, and RefCOCO/+/g. Grounding DINO achieves a 52.5 AP on the COCO detection zero-shot transfer benchmark, i.e., without any training data from COCO. It sets a new record on the ODinW zero-shot benchmark with a mean 26.1 AP. + +
+ +
+ +## COCO Results and Models + +| Model | Backbone | Style | COCO mAP | Pre-Train Data | Config | Download | +| :-----------------: | :------: | :-------: | :--------: | :-------------------: | :------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------: | +| Grounding DINO-T | Swin-T | Zero-shot | 46.7 | O365 | | | +| Grounding DINO-T | Swin-T | Zero-shot | 48.1 | O365,GoldG | | | +| Grounding DINO-T | Swin-T | Zero-shot | 48.4 | O365,GoldG,Cap4M | [config](grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/groundingdino_swint_ogc_mmdet-822d7e9d.pth) | +| Grounding DINO-T-V2 | Swin-T | Zero-shot | 48.5(+1.8) | O365 | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | Swin-T | Zero-shot | 50.4(+2.3) | O365,GoldG | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | Swin-T | Zero-shot | 50.5(+2.1) | O365,GoldG,GRIT | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | Swin-T | Zero-shot | 50.4(+2.0) | O365,GoldG,GRIT,V3Det | [config](<>) | [model](<>) | + +## LVIS Results + +| Model | MiniVal APr | MiniVal APc | MiniVal APf | MiniVal AP | Val1.0 APr | Val1.0 APc | Val1.0 APf | Val1.0 AP | Pre-Train Data | Config | Download | +| :-----------------: | :---------: | :---------: | :---------: | :---------: | :--------: | :--------: | :--------: | :---------: | :-------------------: | :----------: | :---------: | +| Grounding DINO-T | 18.8 | 24.2 | 34.7 | 28.8 | 10.1 | 15.3 | 29.9 | 20.1 | O365,GoldG,Cap4M | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | 28.1 | 30.2 | 42.0 | 35.7(+6.9) | 17.1 | 22.4 | 36.5 | 27.0(+6.9) | O365,GoldG | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | 26.6 | 32.4 | 41.8 | 36.5(+7.7) | 17.3 | 22.6 | 36.4 | 27.1(+7.0) | O365,GoldG,GRIT | [config](<>) | [model](<>) | +| Grounding DINO-T-V2 | 34.2 | 37.4 | 46.2 | 41.4(+12.6) | 23.6 | 27.6 | 40.5 | 31.9(+11.8) | O365,GoldG,GRIT,V3Det | [config](<>) | [model](<>) | + +## ODinW (Object Detection in the Wild) Results + +Learning visual representations from natural language supervision has recently shown great promise in a number of pioneering works. In general, these language-augmented visual models demonstrate strong transferability to a variety of datasets and tasks. However, it remains challenging to evaluate the transferablity of these models due to the lack of easy-to-use evaluation toolkits and public benchmarks. To tackle this, we build ELEVATER 1 , the first benchmark and toolkit for evaluating (pre-trained) language-augmented visual models. ELEVATER is composed of three components. (i) Datasets. As downstream evaluation suites, it consists of 20 image classification datasets and 35 object detection datasets, each of which is augmented with external knowledge. (ii) Toolkit. An automatic hyper-parameter tuning toolkit is developed to facilitate model evaluation on downstream tasks. (iii) Metrics. A variety of evaluation metrics are used to measure sample-efficiency (zero-shot and few-shot) and parameter-efficiency (linear probing and full model fine-tuning). ELEVATER is platform for Computer Vision in the Wild (CVinW), and is publicly released at https://computer-vision-in-the-wild.github.io/ELEVATER/ + +### Results and models of ODinW13 + +| Method | GroundingDINO-T
(O365,GoldG,Cap4M) | GroundingDINO-T-V2
(O365,GoldG) | GroundingDINO-T-V2
(O365,GoldG,GRIT) | GroundingDINO-T-V2
(O365,GoldG,GRIT,V3Det) | +| --------------------- | ---------------------------------------- | ------------------------------------- | ------------------------------------------ | ------------------------------------------------ | +| AerialMaritimeDrone | 0.173 | 0.133 | 0.155 | 0.151 | +| Aquarium | 0.195 | 0.252 | 0.261 | 0.283 | +| CottontailRabbits | 0.799 | 0.771 | 0.810 | 0.786 | +| EgoHands | 0.608 | 0.499 | 0.537 | 0.519 | +| NorthAmericaMushrooms | 0.507 | 0.331 | 0.462 | 0.767 | +| Packages | 0.687 | 0.707 | 0.687 | 0.706 | +| PascalVOC | 0.563 | 0.565 | 0.580 | 0.566 | +| pistols | 0.726 | 0.585 | 0.709 | 0.729 | +| pothole | 0.215 | 0.136 | 0.285 | 0.243 | +| Raccoon | 0.549 | 0.469 | 0.511 | 0.535 | +| ShellfishOpenImages | 0.393 | 0.321 | 0.437 | 0.488 | +| thermalDogsAndPeople | 0.657 | 0.556 | 0.603 | 0.542 | +| VehiclesOpenImages | 0.613 | 0.566 | 0.603 | 0.615 | +| Average | **0.514** | **0.453** | **0.511** | **0.533** | + +### Results and models of ODinW35 + +| Method | GroundingDINO-T
(O365,GoldG,Cap4M) | GroundingDINO-T-V2
(O365,GoldG) | GroundingDINO-T-V2
(O365,GoldG,GRIT) | GroundingDINO-T-V2
(O365,GoldG,GRIT,V3Det) | +| --------------------------- | ---------------------------------------- | ------------------------------------- | ------------------------------------------ | ------------------------------------------------ | +| AerialMaritimeDrone_large | 0.173 | 0.133 | 0.155 | 0.151 | +| AerialMaritimeDrone_tiled | 0.206 | 0.170 | 0.225 | 0.206 | +| AmericanSignLanguageLetters | 0.002 | 0.016 | 0.020 | 0.007 | +| Aquarium | 0.195 | 0.252 | 0.261 | 0.283 | +| BCCD | 0.161 | 0.069 | 0.118 | 0.077 | +| boggleBoards | 0.000 | 0.002 | 0.001 | 0.002 | +| brackishUnderwater | 0.021 | 0.033 | 0.021 | 0.025 | +| ChessPieces | 0.000 | 0.000 | 0.000 | 0.000 | +| CottontailRabbits | 0.806 | 0.771 | 0.810 | 0.786 | +| dice | 0.004 | 0.002 | 0.005 | 0.001 | +| DroneControl | 0.042 | 0.047 | 0.097 | 0.074 | +| EgoHands_generic | 0.608 | 0.527 | 0.537 | 0.519 | +| EgoHands_specific | 0.002 | 0.001 | 0.005 | 0.003 | +| HardHatWorkers | 0.046 | 0.048 | 0.070 | 0.108 | +| MaskWearing | 0.004 | 0.009 | 0.004 | 0.009 | +| MountainDewCommercial | 0.430 | 0.453 | 0.465 | 0.430 | +| NorthAmericaMushrooms | 0.471 | 0.331 | 0.462 | 0.767 | +| openPoetryVision | 0.000 | 0.001 | 0.000 | 0.000 | +| OxfordPets_by_breed | 0.003 | 0.002 | 0.004 | 0.004 | +| OxfordPets_by_species | 0.011 | 0.019 | 0.016 | 0.015 | +| PKLot | 0.001 | 0.004 | 0.002 | 0.007 | +| Packages | 0.695 | 0.707 | 0.687 | 0.706 | +| PascalVOC | 0.563 | 0.565 | 0.580 | 0.566 | +| pistols | 0.726 | 0.585 | 0.709 | 0.729 | +| plantdoc | 0.005 | 0.005 | 0.007 | 0.011 | +| pothole | 0.215 | 0.136 | 0.219 | 0.168 | +| Raccoons | 0.549 | 0.469 | 0.511 | 0.535 | +| selfdrivingCar | 0.089 | 0.091 | 0.076 | 0.083 | +| ShellfishOpenImages | 0.393 | 0.321 | 0.437 | 0.488 | +| ThermalCheetah | 0.087 | 0.063 | 0.081 | 0.045 | +| thermalDogsAndPeople | 0.657 | 0.556 | 0.603 | 0.543 | +| UnoCards | 0.006 | 0.012 | 0.010 | 0.005 | +| VehiclesOpenImages | 0.613 | 0.566 | 0.603 | 0.615 | +| WildfireSmoke | 0.134 | 0.106 | 0.154 | 0.127 | +| websiteScreenshots | 0.012 | 0.02 | 0.016 | 0.016 | +| Average | **0.227** | **0.202** | **0.228** | **0.284** | + +## Referring Expression Comprehension Results + +| Method | GroundingDINO-T
(O365,GoldG,Cap4M) | GroundingDINO-T-V2
(O365,GoldG) | GroundingDINO-T-V2
(O365,GoldG,GRIT) | GroundingDINO-T-V2
(O365,GoldG,GRIT,V3Det) | +| --------------------------------------- | ---------------------------------------- | ------------------------------------- | ------------------------------------------ | ------------------------------------------------ | +| RefCOCO val @1,5,10 | 50.77/89.45/94.86 | 53.06/89.91/94.69 | 53.4/90.3/95.5 | 53.1/89.7/95.1 | +| RefCOCO testA @1,5,10 | 57.45/91.29/95.62 | 59.70/91.50/95.88 | 58.8/91.70/96.2 | 59.1/91.0/95.5 | +| RefCOCO testB @1,5,10 | 44.97/86.54/92.88 | 46.38/86.87/92.21 | 46.8/87.7/93.3 | 46.8/87.8/93.6 | +| RefCOCO+ val @1,5,10 | 51.64/86.35/92.57 | 53.11/87.00/92.79 | 53.5/88.00/93.7 | 52.7/87.7/93.5 | +| RefCOCO+ testA @1,5,10 | 57.25/86.74/92.65 | 58.94/87.34/92.91 | 59.0/88.1/93.7 | 58.7/87.2/93.1 | +| RefCOCO+ testB @1,5,10 | 46.35/84.05/90.67 | 47.92/84.31/91.04 | 47.9/85.5/92.7 | 48.4/85.8/92.1 | +| RefCOCOg val @1,5,10 | 60.42/92.10/96.18 | 61.23/92.61/96.14 | 62.7/93.3/97.0 | 62.9/93.3/97.2 | +| RefCOCOg test @1,5,10 | 59.74/92.08/96.28 | 61.13/93.26/96.72 | 62.6/94.9/97.1 | 62.9/93.9/97.43 | +| gRefCOCO val Pr@(F1=1, IoU≥0.5),N-acc | 41.32/91.82 | 39.76/84.65 | 40.7/89.7 | 41.0/91.3 | +| gRefCOCO testA Pr@(F1=1, IoU≥0.5),N-acc | 27.23/90.24 | 26.25/89.04 | 26.0/91.9 | 26.1/93.0 | +| gRefCOCO testB Pr@(F1=1, IoU≥0.5),N-acc | 29.70/93.49 | 31.31/84.79 | 30.6/90.2 | 30.4/92.3 | + +## Description Detection Dataset + +```shell +pip install ddd-dataset +``` + +| Method | mode | Grounding DINO-T
(O365,GoldG,Cap4M) | GroundingDINO-T-V2
(O365,GoldG) | GroundingDINO-T-V2
(O365,GoldG,GRIT) | GroundingDINO-T-V2
(O365,GoldG,GRIT,V3Det) |
+| -------------------------------- | -------- | ------------------------- | ------------------------- | ------------------------- | ------------------------- |
+| FULL/short/middle/long/very long | concat | 17.2/18.0/18.7/14.8/16.3 | 15.6/17.3/16.7/14.3/13.1 | 17.0/17.7/18.0/15.7/15.7 | 17.5/23.4/18.3/14.7/13.8 |
+| FULL/short/middle/long/very long | parallel | 22.3/28.2/24.8/19.1/13.9 | | 22.5/25.6/25.1/20.5/14.9 | 22.9/28.1/25.4/20.4/14.4 |
+| PRES/short/middle/long/very long | concat | 17.8/18.3/19.2/15.2/17.3 | 16.4/18.4/17.3/14.5/14.2 | 17.9/19.0/18.3/16.5/17.5 | 18.0/23.7/18.6/15.4/13.3 |
+| PRES/short/middle/long/very long | parallel | 21.0/27.0/22.8/17.5/12.5 | | 21.5/25.2/23.0/19.0/15.0 | 21.9/27.4/23.2/19.1/14.2 |
+| ABS/short/middle/long/very long | concat | 15.4/17.1/16.4/13.6/14.9 | 13.4/13.4/14.5/13.5/11.9 | 14.5/13.1/16.7/13.6/13.3 | 15.9/22.2/17.1/12.5/14.4 |
+| ABS/short/middle/long/very long | parallel | 26.0/32.0/33.0/23.6/15.5 | | 25.6/26.8/33.9/24.5/14.7 | 26.0/30.3/34.1/23.9/14.6 |
+
+Note:
+
+1. Inter-scenario evaluation is temporarily not supported because it is very slow and its performance is low; all metrics above are Intra-scenario.
+2. `concat` is the default inference mode for Grounding DINO: multiple sub-sentences are concatenated with "." into a single sentence for one inference pass. `parallel` instead runs inference on each sub-sentence separately in a for-loop.
+
+## Flickr30k Results
+
+| Model | Pre-Train Data | Val R@1 | Val R@5 | Val R@10 | Test R@1 | Test R@5 | Test R@10 | Config | Download |
+| :-----------------: | :-------------------: | ------- | ------- | -------- | -------- | -------- | --------- | :------: | :------: |
+| Grounding DINO-T | O365,GoldG,Cap4M | 87.8 | 96.6 | 98.0 | 88.1 | 96.9 | 98.2 | [config](grounding_dino_swin-t_finetune_16xb2_1x_coco.py) | [model](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544-5f234b20.pth) \| [log](https://download.openmmlab.com/mmdetection/v3.0/grounding_dino/grounding_dino_swin-t_finetune_16xb2_1x_coco/grounding_dino_swin-t_finetune_16xb2_1x_coco_20230921_152544.log.json) |
+| Grounding DINO-T-V2 | O365,GoldG | | | | | | | | |
+| Grounding DINO-T-V2 | O365,GoldG,GRIT | | | | | | | | |
+| Grounding DINO-T-V2 | O365,GoldG,GRIT,V3Det | | | | | | | | |
+
+Note:
+
+1. `@1,5,10` refers to precision at the top 1, 5, and 10 positions in a predicted ranked list.
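+
+The zero-shot results above can be evaluated with MMDetection's standard test entry points and the configs added under `configs/mm_grounding_dino/`. The commands below are a minimal sketch: the checkpoint path is a placeholder and should point to a downloaded MM Grounding DINO weight file.
+
+```shell
+# Single-GPU evaluation, e.g. zero-shot ODinW13.
+# NOTE: the .pth path is a placeholder; substitute your downloaded checkpoint.
+python tools/test.py \
+    configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py \
+    /path/to/mm_grounding_dino_swin-t.pth
+
+# Multi-GPU evaluation, e.g. Description Detection Dataset in the default concat mode.
+bash tools/dist_test.sh \
+    configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py \
+    /path/to/mm_grounding_dino_swin-t.pth 8
+```
+
+The `parallel` DOD config differs only in setting `test_cfg.chunked_size=1`, so the same command applies with `grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py`.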
diff --git a/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py b/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py new file mode 100644 index 00000000000..b0c09f0a9e4 --- /dev/null +++ b/configs/mm_grounding_dino/brain_tumor/grounding_dino_swin-t_finetune_8xb4_50e_brain_tumor.py @@ -0,0 +1,110 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +# https://universe.roboflow.com/roboflow-100/brain-tumor-m2pbp/dataset/2 +data_root = 'data/brain_tumor_v2/' +class_name = ('label0', 'label1', 'label2') +palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142)] + +metainfo = dict(classes=class_name, palette=palette) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=10, + dataset=dict( + type='CocoDataset', + data_root=data_root, + metainfo=metainfo, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + return_classes=True, + data_prefix=dict(img='train/'), + ann_file='train/_annotations.coco.json'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + return_classes=True, + ann_file='valid/_annotations.coco.json', + data_prefix=dict(img='valid/'))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'valid/_annotations.coco.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[4], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py b/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py new file mode 100644 index 00000000000..46b2dbd68fe 
--- /dev/null +++ b/configs/mm_grounding_dino/cityscapes/grounding_dino_swin-t_finetune_8xb4_50e_cityscapes.py @@ -0,0 +1,110 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/cityscapes/' +class_name = ('person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle') +palette = [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), (0, 60, 100), + (0, 80, 100), (0, 0, 230), (119, 11, 32)] + +metainfo = dict(classes=class_name, palette=palette) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=10, + dataset=dict( + type='CocoDataset', + data_root=data_root, + metainfo=metainfo, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + return_classes=True, + data_prefix=dict(img='leftImg8bit/train/'), + ann_file='annotations/instancesonly_filtered_gtFine_train.json'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + return_classes=True, + ann_file='annotations/instancesonly_filtered_gtFine_val.json', + data_prefix=dict(img='leftImg8bit/val/'))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/instancesonly_filtered_gtFine_val.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[4], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py b/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py new file mode 100644 index 00000000000..1253f43470e --- /dev/null +++ b/configs/mm_grounding_dino/coco/grounding_dino_swin-t_finetune_16xb4_1x_coco.py @@ -0,0 +1,85 @@ +_base_ = 
'../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='CocoDataset', + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + return_classes=True, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline)) + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 12 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py b/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py new file mode 100644 index 00000000000..e59a0a52518 --- /dev/null +++ b/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py @@ -0,0 +1,78 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/d3/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', 'sent_ids')) +] + +# -------------------------------------------------# +val_dataset_full = dict( + type='DODDataset', + data_root=data_root, + ann_file='d3_json/d3_full_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) + +val_evaluator_full = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_full_annotations.json') + +# -------------------------------------------------# +val_dataset_pres = dict( + type='DODDataset', + data_root=data_root, + 
ann_file='d3_json/d3_pres_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) +val_evaluator_pres = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_pres_annotations.json') + +# -------------------------------------------------# +val_dataset_abs = dict( + type='DODDataset', + data_root=data_root, + ann_file='d3_json/d3_abs_annotations.json', + data_prefix=dict(img='d3_images/', anno='d3_pkl'), + pipeline=test_pipeline, + test_mode=True, + backend_args=None, + return_classes=True) +val_evaluator_abs = dict( + type='DODCocoMetric', + ann_file=data_root + 'd3_json/d3_abs_annotations.json') + +# -------------------------------------------------# +datasets = [val_dataset_full, val_dataset_pres, val_dataset_abs] +dataset_prefixes = ['FULL', 'PRES', 'ABS'] +metrics = [val_evaluator_full, val_evaluator_pres, val_evaluator_abs] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py b/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py new file mode 100644 index 00000000000..3d680091162 --- /dev/null +++ b/configs/mm_grounding_dino/dod/grounding_dino_swin-t_pretrain_zeroshot_parallel_dod.py @@ -0,0 +1,3 @@ +_base_ = 'grounding_dino_swin-t_pretrain_zeroshot_concat_dod.py' + +model = dict(test_cfg=dict(chunked_size=1)) diff --git a/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py b/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py new file mode 100644 index 00000000000..b0c94e31f2b --- /dev/null +++ b/configs/mm_grounding_dino/flickr30k/grounding_dino_swin-t-pretrain_zeroshot_flickr30k.py @@ -0,0 +1,57 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +dataset_type = 'Flickr30kDataset' +data_root = 'data/flickr30k/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive', 'phrase_ids', 'phrases')) +] + +dataset_Flickr30k_val = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_val.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +dataset_Flickr30k_test = dict( + type=dataset_type, + data_root=data_root, + ann_file='mdetr_annotations/final_flickr_separateGT_test.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, +) + +val_evaluator_Flickr30k = dict(type='Flickr30kMetric') + +test_evaluator_Flickr30k = dict(type='Flickr30kMetric') + +# ----------Config---------- # +dataset_prefixes = ['Flickr30kVal', 'Flickr30kTest'] +datasets = [dataset_Flickr30k_val, dataset_Flickr30k_test] +metrics = [val_evaluator_Flickr30k, test_evaluator_Flickr30k] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + 
+val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/grounding_dino_swin-b_pretrain_pl.py b/configs/mm_grounding_dino/grounding_dino_swin-b_pretrain_pl.py new file mode 100644 index 00000000000..31591e55643 --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-b_pretrain_pl.py @@ -0,0 +1,42 @@ +_base_ = '../grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py' + +model = dict(test_cfg=dict(max_per_img=10)) + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadTextAnnotations'), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive')) +] + +data_root = 'data/' + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=False, + dataset=dict( + type='ODVGDataset', + data_root=data_root, + ann_file='final_flickr_separateGT_train_vg.json', + data_prefix=dict(img='flickr30k_images/'), + pipeline=test_pipeline, + return_classes=True)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + outfile_path='aa.json', + img_prefix=data_root + 'flickr30k_images/', + type='DumpODVGResults') +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py b/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py new file mode 100644 index 00000000000..8523eb6b01e --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-l_pretrain_all.py @@ -0,0 +1,473 @@ +_base_ = 'grounding_dino_swin-t_pretrain_obj365.py' + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +num_levels = 5 +model = dict( + num_feature_levels=num_levels, + backbone=dict( + _delete_=True, + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=True, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels), + encoder=dict(layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))), + decoder=dict(layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels)))) + +# --------------------------- object365v2 od dataset--------------------------- +objv2_backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/objects365v2/': 'yudong:s3://wangyudong/obj365_v2/', + 'data/objects365v2/': 'yudong:s3://wangyudong/obj365_v2/' + })) + +objv2_train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=objv2_backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + 
dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + # change this + label_map_file='data/objects365v2/annotations/o365v2_label_map.json', + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +o365v2_dataset = dict( + type='ODVGDataset', + data_root='data/objects365v2/', + ann_file='annotations/zhiyuan_objv2_train_od.json', + label_map_file='annotations/o365v2_label_map.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=objv2_train_pipeline, + return_classes=True, + need_text=False, # change this + backend_args=None, +) + +# --------------------------- openimagev6 od dataset--------------------------- +oi_backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/': 's3://openmmlab/datasets/detection/', + 'data/': 's3://openmmlab/datasets/detection/' + })) + +oi_train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=oi_backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + # change this + label_map_file='data/OpenImages/annotations/openimages_label_map.json', + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +oiv6_dataset = dict( + type='ODVGDataset', + data_root='data/OpenImages/', + ann_file='annotations/oidv6-train-annotations-vg.jsonl', + label_map_file='annotations/openimages_label_map.json', + data_prefix=dict(img='OpenImages/train/'), + filter_cfg=dict(filter_empty_gt=False), + need_text=False, # change this + pipeline=oi_train_pipeline, + return_classes=True, + backend_args=None) + +# --------------------------- v3det od 
dataset--------------------------- +v3d_train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + # change this + label_map_file='data/V3Det/annotations/v3det_2023_v1_label_map.json', + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] +v3det_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/V3Det/', + ann_file='annotations/v3det_2023_v1_train_od.json', + label_map_file='annotations/v3det_2023_v1_label_map.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + need_text=False, # change this + pipeline=v3d_train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- coco2017 od dataset--------------------------- +coco2017_train_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='instance_train2017_norefval_od.json', + label_map_file='coco2017_label_map.json', + data_prefix=dict(img='train2017'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- flickr30k vg dataset--------------------------- +flickr30k_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/flickr30k_entities/', + ann_file='final_flickr_separateGT_train_vg.json', + label_map_file=None, + data_prefix=dict(img='flickr30k_images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- gqa vg dataset--------------------------- +gqa_dataset = dict( + type='ODVGDataset', + data_root='data/gqa/', + ann_file='final_mixed_train_no_coco_vg.json', + label_map_file=None, + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +# --------------------------- coco2014 vg dataset--------------------------- +coco2014_vg_dataset = dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='mdetr_annotations/final_mixed_train_only_coco_vg.json', + label_map_file=None, + data_prefix=dict(img='train2014/'), + filter_cfg=dict(filter_empty_gt=False), + 
pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +# --------------------------- refcoco vg dataset--------------------------- +refcoco_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='mdetr_annotations/finetune_refcoco_train_vg.json', + label_map_file=None, + data_prefix=dict(img='train2014'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- refcoco+ vg dataset--------------------------- +refcoco_plus_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='mdetr_annotations/finetune_refcoco+_train_vg.json', + label_map_file=None, + data_prefix=dict(img='train2014'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- refcocog vg dataset--------------------------- +refcocog_dataset = dict( + type='RepeatDataset', + times=3, + dataset=dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='mdetr_annotations/finetune_refcocog_train_vg.json', + label_map_file=None, + data_prefix=dict(img='train2014'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- grefcoco vg dataset--------------------------- +grefcoco_dataset = dict( + type='RepeatDataset', + times=2, + dataset=dict( + type='ODVGDataset', + data_root='data/coco/', + ann_file='mdetr_annotations/finetune_grefcoco_train_vg.json', + label_map_file=None, + data_prefix=dict(img='train2014'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None)) + +# --------------------------- grit vg dataset--------------------------- +grit_backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/grit/': 'yichen:s3://chenyicheng/grit/', + 'data/grit/': 'yichen:s3://chenyicheng/grit/' + })) + +grit_train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=grit_backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +grit_dataset = dict( + type='ODVGDataset', + 
data_root='data/grit/', + ann_file='grit20m_vg.json', + label_map_file=None, + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + pipeline=grit_train_pipeline, + return_classes=True, + backend_args=None) + +# --------------------------- dataloader--------------------------- +train_dataloader = dict( + batch_size=4, + num_workers=4, + sampler=dict( + _delete_=True, + type='CustomSampleSizeSampler', + ratio_mode=True, + # OD ~ 1.74+1.67*0.5+0.18*2+0.12*2=3.175 + # vg ~ 0.15*2+0.62*1+0.49*1+0.12*2+0.12*2+0.08*3+0.19*2+9*0.09=3.32 + dataset_size=[-1, 0.5, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0.09]), + dataset=dict(datasets=[ + o365v2_dataset, # 1.74M + oiv6_dataset, # 1.67M + v3det_dataset, # 0.18M + coco2017_train_dataset, # 0.12M + flickr30k_dataset, # 0.15M + gqa_dataset, # 0.62M + coco2014_vg_dataset, # 0.49M + refcoco_dataset, # 0.12M + refcoco_plus_dataset, # 0.12M + refcocog_dataset, # 0.08M + grefcoco_dataset, # 0.19M + grit_dataset # 9M + ])) + +# bs=256 +optim_wrapper = dict(optimizer=dict(lr=0.0008)) + +# one epoch = (3.175+3.32)M/256 = 25371 iter +# 24e=608904 iter +# 16e=405936 iter +# 20e=507420 iter +max_iter = 608904 +train_cfg = dict( + _delete_=True, + type='IterBasedTrainLoop', + max_iters=max_iter, + val_interval=13000) + +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_iter, + by_epoch=False, + milestones=[405936, 507420], + gamma=0.1) +] + +default_hooks = dict( + checkpoint=dict(by_epoch=False, interval=13000, max_keep_ckpts=30)) +log_processor = dict(by_epoch=False) diff --git a/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py new file mode 100644 index 00000000000..782487434fe --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365.py @@ -0,0 +1,245 @@ +_base_ = [ + '../_base_/datasets/coco_detection.py', + '../_base_/schedules/schedule_1x.py', '../_base_/default_runtime.py' +] +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa +lang_model_name = 'bert-base-uncased' + +model = dict( + type='GroundingDINO', + num_queries=900, + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=False, + ), + language_model=dict( + type='BertModel', + name=lang_model_name, + max_tokens=256, + pad_to_max=False, + use_sub_sentence_represent=True, + special_tokens_list=['[CLS]', '[SEP]', '.', '?'], + add_pooling_layer=False, + ), + backbone=dict( + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(1, 2, 3), + with_cp=True, + convert_weights=True, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + neck=dict( + type='ChannelMapper', + in_channels=[192, 384, 768], + kernel_size=1, + out_channels=256, + act_cfg=None, + bias=True, + norm_cfg=dict(type='GN', num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + num_cp=6, + # visual layer config + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + # text layer 
config + text_layer_cfg=dict( + self_attn_cfg=dict(num_heads=4, embed_dims=256, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.0)), + # fusion layer config + fusion_layer_cfg=dict( + v_dim=256, + l_dim=256, + embed_dim=1024, + num_heads=4, + init_values=1e-4), + ), + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + # query self attention layer + self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to text + cross_attn_text_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + # cross attention layer query to image + cross_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), + post_norm_cfg=None), + positional_encoding=dict( + num_feats=128, normalize=True, offset=0.0, temperature=20), + bbox_head=dict( + type='GroundingDINOHead', + num_classes=256, + sync_cls_avg_factor=True, + contrastive_cfg=dict(max_text_len=256, log_scale='auto', bias=True), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type='L1Loss', loss_weight=5.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='BinaryFocalLossCost', weight=2.0), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) + +# dataset settings +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive')) +] + 
+dataset_type = 'ODVGDataset' +data_root = 'data/objects365v1/' + +coco_od_dataset = dict( + type=dataset_type, + data_root=data_root, + ann_file='o365v1_train_odvg.jsonl', + label_map_file='o365v1_label_map.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=train_pipeline, + return_classes=True, + backend_args=None) + +train_dataloader = dict( + _delete_=True, + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict(type='ConcatDataset', datasets=[coco_od_dataset])) + +val_dataloader = dict( + dataset=dict(pipeline=test_pipeline, return_classes=True)) +test_dataloader = val_dataloader + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0004, + weight_decay=0.0001), # bs=16 0.0001 + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + 'language_model': dict(lr_mult=0.1), + })) + +# learning policy +max_epochs = 30 +param_scheduler = [ + dict(type='LinearLR', start_factor=0.1, by_epoch=False, begin=0, end=1000), + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[19, 26], + gamma=0.1) +] + +train_cfg = dict( + type='EpochBasedTrainLoop', max_epochs=max_epochs, val_interval=1) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py new file mode 100644 index 00000000000..a86abd7997e --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg.py @@ -0,0 +1,38 @@ +_base_ = 'grounding_dino_swin-t_pretrain_obj365.py' + +o365v1_od_dataset = dict( + type='ODVGDataset', + data_root='data/objects365v1/', + ann_file='o365v1_train_odvg.jsonl', + label_map_file='o365v1_label_map.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None, +) + +flickr30k_dataset = dict( + type='ODVGDataset', + data_root='data/flickr30k_entities/', + ann_file='final_flickr_separateGT_train_vg.json', + label_map_file=None, + data_prefix=dict(img='flickr30k_images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +gqa_dataset = dict( + type='ODVGDataset', + data_root='data/gqa/', + ann_file='final_mixed_train_no_coco_vg.json', + label_map_file=None, + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +train_dataloader = dict( + dataset=dict(datasets=[o365v1_od_dataset, flickr30k_dataset, gqa_dataset])) diff --git a/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py new file mode 100644 index 00000000000..1cd659f063e --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m.py @@ -0,0 +1,55 @@ +_base_ = 'grounding_dino_swin-t_pretrain_obj365.py' + +o365v1_od_dataset = dict( + type='ODVGDataset', + 
data_root='data/objects365v1/', + ann_file='o365v1_train_odvg.jsonl', + label_map_file='o365v1_label_map.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None, +) + +flickr30k_dataset = dict( + type='ODVGDataset', + data_root='data/flickr30k_entities/', + ann_file='final_flickr_separateGT_train_vg.json', + label_map_file=None, + data_prefix=dict(img='flickr30k_images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +gqa_dataset = dict( + type='ODVGDataset', + data_root='data/gqa/', + ann_file='final_mixed_train_no_coco_vg.json', + label_map_file=None, + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +grit_dataset = dict( + type='ODVGDataset', + data_root='grit_processed/', + ann_file='grit20m_vg.json', + label_map_file=None, + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +train_dataloader = dict( + sampler=dict( + _delete_=True, + type='CustomSampleSizeSampler', + dataset_size=[-1, -1, -1, 500000]), + dataset=dict(datasets=[ + o365v1_od_dataset, flickr30k_dataset, gqa_dataset, grit_dataset + ])) diff --git a/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py new file mode 100644 index 00000000000..5a7d3b58947 --- /dev/null +++ b/configs/mm_grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_grit9m_v3det.py @@ -0,0 +1,117 @@ +_base_ = 'grounding_dino_swin-t_pretrain_obj365.py' + +o365v1_od_dataset = dict( + type='ODVGDataset', + data_root='data/objects365v1/', + ann_file='o365v1_train_odvg.jsonl', + label_map_file='o365v1_label_map.json', + data_prefix=dict(img='train/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None, +) + +flickr30k_dataset = dict( + type='ODVGDataset', + data_root='data/flickr30k_entities/', + ann_file='final_flickr_separateGT_train_vg.json', + label_map_file=None, + data_prefix=dict(img='flickr30k_images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +gqa_dataset = dict( + type='ODVGDataset', + data_root='data/gqa/', + ann_file='final_mixed_train_no_coco_vg.json', + label_map_file=None, + data_prefix=dict(img='images/'), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +v3d_train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + 
type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + # change this + label_map_file='data/V3Det/annotations/v3det_2023_v1_label_map.json', + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] +v3det_dataset = dict( + type='ODVGDataset', + data_root='data/V3Det/', + ann_file='annotations/v3det_2023_v1_train_od.json', + label_map_file='annotations/v3det_2023_v1_label_map.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + need_text=False, # change this + pipeline=v3d_train_pipeline, + return_classes=True, + backend_args=None) + +grit_dataset = dict( + type='ODVGDataset', + data_root='grit_processed/', + ann_file='grit20m_vg.json', + label_map_file=None, + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False), + pipeline=_base_.train_pipeline, + return_classes=True, + backend_args=None) + +train_dataloader = dict( + sampler=dict( + _delete_=True, + type='CustomSampleSizeSampler', + dataset_size=[-1, -1, -1, -1, 500000]), + dataset=dict(datasets=[ + o365v1_od_dataset, flickr30k_dataset, gqa_dataset, v3det_dataset, + grit_dataset + ])) diff --git a/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py new file mode 100644 index 00000000000..3ca34c88509 --- /dev/null +++ b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_finetune_16xb4_1x_lvis.py @@ -0,0 +1,120 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/coco/' + +model = dict(test_cfg=dict( + max_per_img=300, + chunked_size=40, +)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + # change this + label_map_file='data/coco/annotations/lvis_v1_label_map.json', + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +train_dataloader = dict( + dataset=dict( + 
_delete_=True, + type='ClassBalancedDataset', + oversample_thr=1e-3, + dataset=dict( + type='ODVGDataset', + data_root=data_root, + need_text=False, + label_map_file='annotations/lvis_v1_label_map.json', + ann_file='annotations/lvis_v1_train_od.json', + data_prefix=dict(img=''), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + return_classes=True, + pipeline=train_pipeline))) + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + type='LVISV1Dataset', + ann_file='annotations/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='LVISFixedAPMetric', + ann_file=data_root + + 'annotations/lvis_v1_minival_inserted_image_name.json') +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 12 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=3) + +default_hooks = dict( + checkpoint=dict( + max_keep_ckpts=1, save_best='lvis_fixed_ap/AP', rule='greater')) + +load_from = '' diff --git a/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py new file mode 100644 index 00000000000..fb4ed438e0b --- /dev/null +++ b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_lvis.py @@ -0,0 +1,24 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +model = dict(test_cfg=dict( + max_per_img=300, + chunked_size=40, +)) + +dataset_type = 'LVISV1Dataset' +data_root = 'data/coco/' + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + type=dataset_type, + ann_file='annotations/lvis_od_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +# numpy < 1.24.0 +val_evaluator = dict( + _delete_=True, + type='LVISFixedAPMetric', + ann_file=data_root + 'annotations/lvis_od_val.json') +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py new file mode 100644 index 00000000000..406a39a4264 --- /dev/null +++ b/configs/mm_grounding_dino/lvis/grounding_dino_swin-t_pretrain_zeroshot_mini-lvis.py @@ -0,0 +1,25 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +model = dict(test_cfg=dict( + max_per_img=300, + chunked_size=40, +)) + +dataset_type = 'LVISV1Dataset' +data_root = 'data/coco/' + +val_dataloader = dict( + dataset=dict( + data_root=data_root, + type=dataset_type, + ann_file='annotations/lvis_v1_minival_inserted_image_name.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +# numpy < 1.24.0 +val_evaluator = dict( + _delete_=True, + type='LVISFixedAPMetric', + ann_file=data_root + + 'annotations/lvis_v1_minival_inserted_image_name.json') +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py b/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py new file mode 100644 index 00000000000..d87ca7ca1ea 
--- /dev/null +++ b/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw13.py @@ -0,0 +1,338 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' # noqa + +dataset_type = 'CocoDataset' +data_root = 'data/odinw/' + +base_test_pipeline = _base_.test_pipeline +base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', 'text', + 'custom_entities', 'caption_prompt') + +# ---------------------1 AerialMaritimeDrone---------------------# +class_name = ('boat', 'car', 'dock', 'jetski', 'lift') +metainfo = dict(classes=class_name) +_data_root = data_root + 'AerialMaritimeDrone/large/' +dataset_AerialMaritimeDrone = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + test_mode=True, + pipeline=base_test_pipeline, + return_classes=True) +val_evaluator_AerialMaritimeDrone = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------2 Aquarium---------------------# +class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish', + 'stingray') +metainfo = dict(classes=class_name) +_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/' + +caption_prompt = None +# caption_prompt = { +# 'penguin': { +# 'suffix': ', which is black and white' +# }, +# 'puffin': { +# 'suffix': ' with orange beaks' +# }, +# 'stingray': { +# 'suffix': ' which is flat and round' +# }, +# } +dataset_Aquarium = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_Aquarium = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------3 CottontailRabbits---------------------# +class_name = ('Cottontail-Rabbit', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'CottontailRabbits/' + +# caption_prompt = None +caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}} + +dataset_CottontailRabbits = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_CottontailRabbits = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------4 EgoHands---------------------# +class_name = ('hand', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'EgoHands/generic/' + +# caption_prompt = None +caption_prompt = {'hand': {'suffix': ' of a person'}} + +dataset_EgoHands = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_EgoHands = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------5 NorthAmericaMushrooms---------------------# +class_name = ('CoW', 'chanterelle') +metainfo = dict(classes=class_name) +_data_root = 
data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/' # noqa + +# caption_prompt = None +caption_prompt = { + 'CoW': { + 'name': 'flat mushroom' + }, + 'chanterelle': { + 'name': 'yellow mushroom' + } +} + +dataset_NorthAmericaMushrooms = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_NorthAmericaMushrooms = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------6 Packages---------------------# +class_name = ('package', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'Packages/Raw/' + +# caption_prompt = None +caption_prompt = { + 'package': { + 'prefix': 'there is a ', + 'suffix': ' on the porch' + } +} + +dataset_Packages = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_Packages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------7 PascalVOC---------------------# +class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', + 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', + 'tvmonitor') +metainfo = dict(classes=class_name) +_data_root = data_root + 'PascalVOC/' +dataset_PascalVOC = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_PascalVOC = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------8 pistols---------------------# +class_name = ('pistol', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'pistols/export/' +dataset_pistols = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='val_annotations_without_background.json', + data_prefix=dict(img=''), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_pistols = dict( + type='CocoMetric', + ann_file=_data_root + 'val_annotations_without_background.json', + metric='bbox') + +# ---------------------9 pothole---------------------# +class_name = ('pothole', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'pothole/' + +# caption_prompt = None +caption_prompt = { + 'pothole': { + 'prefix': 'there are some ', + 'name': 'holes', + 'suffix': ' on the road' + } +} + +dataset_pothole = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_pothole = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------10 Raccoon---------------------# +class_name = ('raccoon', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 
'Raccoon/Raccoon.v2-raw.coco/' +dataset_Raccoon = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_Raccoon = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------11 ShellfishOpenImages---------------------# +class_name = ('Crab', 'Lobster', 'Shrimp') +metainfo = dict(classes=class_name) +_data_root = data_root + 'ShellfishOpenImages/raw/' +dataset_ShellfishOpenImages = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_ShellfishOpenImages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------12 thermalDogsAndPeople---------------------# +class_name = ('dog', 'person') +metainfo = dict(classes=class_name) +_data_root = data_root + 'thermalDogsAndPeople/' +dataset_thermalDogsAndPeople = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_thermalDogsAndPeople = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------13 VehiclesOpenImages---------------------# +class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck') +metainfo = dict(classes=class_name) +_data_root = data_root + 'VehiclesOpenImages/416x416/' +dataset_VehiclesOpenImages = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_VehiclesOpenImages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# --------------------- Config---------------------# +dataset_prefixes = [ + 'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands', + 'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole', + 'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople', + 'VehiclesOpenImages' +] +datasets = [ + dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits, + dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages, + dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon, + dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople, + dataset_VehiclesOpenImages +] +metrics = [ + val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium, + val_evaluator_CottontailRabbits, val_evaluator_EgoHands, + val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages, + val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole, + val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages, + val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages +] + +# -------------------------------------------------# +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, 
+ type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py b/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py new file mode 100644 index 00000000000..a6b8566aed4 --- /dev/null +++ b/configs/mm_grounding_dino/odinw/grounding_dino_swin-t_pretrain_odinw35.py @@ -0,0 +1,794 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' # noqa + +dataset_type = 'CocoDataset' +data_root = 'data/odinw/' + +base_test_pipeline = _base_.test_pipeline +base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', 'text', + 'custom_entities', 'caption_prompt') + +# ---------------------1 AerialMaritimeDrone_large---------------------# +class_name = ('boat', 'car', 'dock', 'jetski', 'lift') +metainfo = dict(classes=class_name) +_data_root = data_root + 'AerialMaritimeDrone/large/' +dataset_AerialMaritimeDrone_large = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_AerialMaritimeDrone_large = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------2 AerialMaritimeDrone_tiled---------------------# +class_name = ('boat', 'car', 'dock', 'jetski', 'lift') +metainfo = dict(classes=class_name) +_data_root = data_root + 'AerialMaritimeDrone/tiled/' +dataset_AerialMaritimeDrone_tiled = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_AerialMaritimeDrone_tiled = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------3 AmericanSignLanguageLetters---------------------# +class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z') +metainfo = dict(classes=class_name) +_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/' # noqa +dataset_AmericanSignLanguageLetters = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_AmericanSignLanguageLetters = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------4 Aquarium---------------------# +class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish', + 'stingray') +metainfo = dict(classes=class_name) +_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/' +dataset_Aquarium = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_Aquarium = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# 
---------------------5 BCCD---------------------# +class_name = ('Platelets', 'RBC', 'WBC') +metainfo = dict(classes=class_name) +_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/' +dataset_BCCD = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_BCCD = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------6 boggleBoards---------------------# +class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he', + 'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu', + 'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z') +metainfo = dict(classes=class_name) +_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/' +dataset_boggleBoards = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='val_annotations_without_background.json', + data_prefix=dict(img=''), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_boggleBoards = dict( + type='CocoMetric', + ann_file=_data_root + 'val_annotations_without_background.json', + metric='bbox') + +# ---------------------7 brackishUnderwater---------------------# +class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish') +metainfo = dict(classes=class_name) +_data_root = data_root + 'brackishUnderwater/960x540/' +dataset_brackishUnderwater = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_brackishUnderwater = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------8 ChessPieces---------------------# +class_name = (' ', 'black bishop', 'black king', 'black knight', 'black pawn', + 'black queen', 'black rook', 'white bishop', 'white king', + 'white knight', 'white pawn', 'white queen', 'white rook') +metainfo = dict(classes=class_name) +_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/' +dataset_ChessPieces = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/new_annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_ChessPieces = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/new_annotations_without_background.json', + metric='bbox') + +# ---------------------9 CottontailRabbits---------------------# +class_name = ('rabbit', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'CottontailRabbits/' +dataset_CottontailRabbits = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/new_annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_CottontailRabbits = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/new_annotations_without_background.json', + metric='bbox') + +# ---------------------10 dice---------------------# +class_name = ('1', '2', '3', '4', '5', '6') +metainfo = dict(classes=class_name) +_data_root = data_root + 
'dice/mediumColor/export/' +dataset_dice = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='val_annotations_without_background.json', + data_prefix=dict(img=''), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_dice = dict( + type='CocoMetric', + ann_file=_data_root + 'val_annotations_without_background.json', + metric='bbox') + +# ---------------------11 DroneControl---------------------# +class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object', + 'takeoff', 'takeoff-hand') +metainfo = dict(classes=class_name) +_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/' +dataset_DroneControl = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_DroneControl = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------12 EgoHands_generic---------------------# +class_name = ('hand', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'EgoHands/generic/' +caption_prompt = {'hand': {'suffix': ' of a person'}} +dataset_EgoHands_generic = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_EgoHands_generic = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------13 EgoHands_specific---------------------# +class_name = ('myleft', 'myright', 'yourleft', 'yourright') +metainfo = dict(classes=class_name) +_data_root = data_root + 'EgoHands/specific/' +dataset_EgoHands_specific = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_EgoHands_specific = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------14 HardHatWorkers---------------------# +class_name = ('head', 'helmet', 'person') +metainfo = dict(classes=class_name) +_data_root = data_root + 'HardHatWorkers/raw/' +dataset_HardHatWorkers = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_HardHatWorkers = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------15 MaskWearing---------------------# +class_name = ('mask', 'no-mask') +metainfo = dict(classes=class_name) +_data_root = data_root + 'MaskWearing/raw/' +dataset_MaskWearing = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_MaskWearing = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + 
metric='bbox') + +# ---------------------16 MountainDewCommercial---------------------# +class_name = ('bottle', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'MountainDewCommercial/' +dataset_MountainDewCommercial = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_MountainDewCommercial = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------17 NorthAmericaMushrooms---------------------# +class_name = ('flat mushroom', 'yellow mushroom') +metainfo = dict(classes=class_name) +_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/' # noqa +dataset_NorthAmericaMushrooms = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/new_annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_NorthAmericaMushrooms = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/new_annotations_without_background.json', + metric='bbox') + +# ---------------------18 openPoetryVision---------------------# +class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial', + 'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand', + 'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate', + 'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans', + 'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande', + 'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy', + 'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus', + 'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia', + 'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS', + 'Verdana') +metainfo = dict(classes=class_name) +_data_root = data_root + 'openPoetryVision/512x512/' +dataset_openPoetryVision = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_openPoetryVision = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------19 OxfordPets_by_breed---------------------# +class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay', + 'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon', + 'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese', + 'cat-Sphynx', 'dog-american_bulldog', + 'dog-american_pit_bull_terrier', 'dog-basset_hound', + 'dog-beagle', 'dog-boxer', 'dog-chihuahua', + 'dog-english_cocker_spaniel', 'dog-english_setter', + 'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese', + 'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger', + 'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian', + 'dog-pug', 'dog-saint_bernard', 'dog-samoyed', + 'dog-scottish_terrier', 'dog-shiba_inu', + 'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier', + 'dog-yorkshire_terrier') +metainfo = dict(classes=class_name) +_data_root = data_root + 'OxfordPets/by-breed/' # noqa +dataset_OxfordPets_by_breed = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + 
ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_OxfordPets_by_breed = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------20 OxfordPets_by_species---------------------# +class_name = ('cat', 'dog') +metainfo = dict(classes=class_name) +_data_root = data_root + 'OxfordPets/by-species/' # noqa +dataset_OxfordPets_by_species = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_OxfordPets_by_species = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------21 PKLot---------------------# +class_name = ('space-empty', 'space-occupied') +metainfo = dict(classes=class_name) +_data_root = data_root + 'PKLot/640/' # noqa +dataset_PKLot = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_PKLot = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------22 Packages---------------------# +class_name = ('package', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'Packages/Raw/' +caption_prompt = { + 'package': { + 'prefix': 'there is a ', + 'suffix': ' on the porch' + } +} +dataset_Packages = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=base_test_pipeline, + caption_prompt=caption_prompt, + test_mode=True, + return_classes=True) +val_evaluator_Packages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------23 PascalVOC---------------------# +class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', + 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', + 'tvmonitor') +metainfo = dict(classes=class_name) +_data_root = data_root + 'PascalVOC/' +dataset_PascalVOC = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_PascalVOC = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------24 pistols---------------------# +class_name = ('pistol', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'pistols/export/' +dataset_pistols = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='val_annotations_without_background.json', + data_prefix=dict(img=''), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_pistols = dict( + type='CocoMetric', + ann_file=_data_root + 'val_annotations_without_background.json', + metric='bbox') + +# ---------------------25 
plantdoc---------------------# +class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf', + 'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf', + 'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight', + 'Corn rust leaf', 'Peach leaf', 'Potato leaf', + 'Potato leaf early blight', 'Potato leaf late blight', + 'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf', + 'Squash Powdery mildew leaf', 'Strawberry leaf', + 'Tomato Early blight leaf', 'Tomato Septoria leaf spot', + 'Tomato leaf', 'Tomato leaf bacterial spot', + 'Tomato leaf late blight', 'Tomato leaf mosaic virus', + 'Tomato leaf yellow virus', 'Tomato mold leaf', + 'Tomato two spotted spider mites leaf', 'grape leaf', + 'grape leaf black rot') +metainfo = dict(classes=class_name) +_data_root = data_root + 'plantdoc/416x416/' +dataset_plantdoc = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_plantdoc = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------26 pothole---------------------# +class_name = ('pothole', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'pothole/' +caption_prompt = { + 'pothole': { + 'name': 'holes', + 'prefix': 'there are some ', + 'suffix': ' on the road' + } +} +dataset_pothole = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + caption_prompt=caption_prompt, + pipeline=base_test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_pothole = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------27 Raccoon---------------------# +class_name = ('raccoon', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/' +dataset_Raccoon = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_Raccoon = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------28 selfdrivingCar---------------------# +class_name = ('biker', 'car', 'pedestrian', 'trafficLight', + 'trafficLight-Green', 'trafficLight-GreenLeft', + 'trafficLight-Red', 'trafficLight-RedLeft', + 'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck') +metainfo = dict(classes=class_name) +_data_root = data_root + 'selfdrivingCar/fixedLarge/export/' +dataset_selfdrivingCar = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='val_annotations_without_background.json', + data_prefix=dict(img=''), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_selfdrivingCar = dict( + type='CocoMetric', + ann_file=_data_root + 'val_annotations_without_background.json', + metric='bbox') + +# ---------------------29 ShellfishOpenImages---------------------# +class_name = ('Crab', 'Lobster', 'Shrimp') +metainfo = dict(classes=class_name) +_data_root = data_root + 'ShellfishOpenImages/raw/' +dataset_ShellfishOpenImages = dict( + type=dataset_type, + metainfo=metainfo, + 
data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_ShellfishOpenImages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------30 ThermalCheetah---------------------# +class_name = ('cheetah', 'human') +metainfo = dict(classes=class_name) +_data_root = data_root + 'ThermalCheetah/' +dataset_ThermalCheetah = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_ThermalCheetah = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------31 thermalDogsAndPeople---------------------# +class_name = ('dog', 'person') +metainfo = dict(classes=class_name) +_data_root = data_root + 'thermalDogsAndPeople/' +dataset_thermalDogsAndPeople = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_thermalDogsAndPeople = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------32 UnoCards---------------------# +class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', + '12', '13', '14') +metainfo = dict(classes=class_name) +_data_root = data_root + 'UnoCards/raw/' +dataset_UnoCards = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_UnoCards = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------33 VehiclesOpenImages---------------------# +class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck') +metainfo = dict(classes=class_name) +_data_root = data_root + 'VehiclesOpenImages/416x416/' +dataset_VehiclesOpenImages = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_VehiclesOpenImages = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------34 WildfireSmoke---------------------# +class_name = ('smoke', ) +metainfo = dict(classes=class_name) +_data_root = data_root + 'WildfireSmoke/' +dataset_WildfireSmoke = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_WildfireSmoke = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# ---------------------35 websiteScreenshots---------------------# +class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link', + 'text') 
+metainfo = dict(classes=class_name) +_data_root = data_root + 'websiteScreenshots/' +dataset_websiteScreenshots = dict( + type=dataset_type, + metainfo=metainfo, + data_root=_data_root, + ann_file='valid/annotations_without_background.json', + data_prefix=dict(img='valid/'), + pipeline=_base_.test_pipeline, + test_mode=True, + return_classes=True) +val_evaluator_websiteScreenshots = dict( + type='CocoMetric', + ann_file=_data_root + 'valid/annotations_without_background.json', + metric='bbox') + +# --------------------- Config---------------------# + +dataset_prefixes = [ + 'AerialMaritimeDrone_large', + 'AerialMaritimeDrone_tiled', + 'AmericanSignLanguageLetters', + 'Aquarium', + 'BCCD', + 'boggleBoards', + 'brackishUnderwater', + 'ChessPieces', + 'CottontailRabbits', + 'dice', + 'DroneControl', + 'EgoHands_generic', + 'EgoHands_specific', + 'HardHatWorkers', + 'MaskWearing', + 'MountainDewCommercial', + 'NorthAmericaMushrooms', + 'openPoetryVision', + 'OxfordPets_by_breed', + 'OxfordPets_by_species', + 'PKLot', + 'Packages', + 'PascalVOC', + 'pistols', + 'plantdoc', + 'pothole', + 'Raccoons', + 'selfdrivingCar', + 'ShellfishOpenImages', + 'ThermalCheetah', + 'thermalDogsAndPeople', + 'UnoCards', + 'VehiclesOpenImages', + 'WildfireSmoke', + 'websiteScreenshots', +] + +datasets = [ + dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled, + dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD, + dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces, + dataset_CottontailRabbits, dataset_dice, dataset_DroneControl, + dataset_EgoHands_generic, dataset_EgoHands_specific, + dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial, + dataset_NorthAmericaMushrooms, dataset_openPoetryVision, + dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot, + dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc, + dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar, + dataset_ShellfishOpenImages, dataset_ThermalCheetah, + dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages, + dataset_WildfireSmoke, dataset_websiteScreenshots +] + +metrics = [ + val_evaluator_AerialMaritimeDrone_large, + val_evaluator_AerialMaritimeDrone_tiled, + val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium, + val_evaluator_BCCD, val_evaluator_boggleBoards, + val_evaluator_brackishUnderwater, val_evaluator_ChessPieces, + val_evaluator_CottontailRabbits, val_evaluator_dice, + val_evaluator_DroneControl, val_evaluator_EgoHands_generic, + val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers, + val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial, + val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision, + val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species, + val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC, + val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole, + val_evaluator_Raccoon, val_evaluator_selfdrivingCar, + val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah, + val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards, + val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke, + val_evaluator_websiteScreenshots +] + +# -------------------------------------------------# +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + 
metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/odinw/override_category.py b/configs/mm_grounding_dino/odinw/override_category.py new file mode 100644 index 00000000000..9ff05fc6e5e --- /dev/null +++ b/configs/mm_grounding_dino/odinw/override_category.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse + +import mmengine + + +def parse_args(): + parser = argparse.ArgumentParser(description='Override Category') + parser.add_argument('data_root') + return parser.parse_args() + + +def main(): + args = parse_args() + + ChessPieces = [{ + 'id': 1, + 'name': ' ', + 'supercategory': 'pieces' + }, { + 'id': 2, + 'name': 'black bishop', + 'supercategory': 'pieces' + }, { + 'id': 3, + 'name': 'black king', + 'supercategory': 'pieces' + }, { + 'id': 4, + 'name': 'black knight', + 'supercategory': 'pieces' + }, { + 'id': 5, + 'name': 'black pawn', + 'supercategory': 'pieces' + }, { + 'id': 6, + 'name': 'black queen', + 'supercategory': 'pieces' + }, { + 'id': 7, + 'name': 'black rook', + 'supercategory': 'pieces' + }, { + 'id': 8, + 'name': 'white bishop', + 'supercategory': 'pieces' + }, { + 'id': 9, + 'name': 'white king', + 'supercategory': 'pieces' + }, { + 'id': 10, + 'name': 'white knight', + 'supercategory': 'pieces' + }, { + 'id': 11, + 'name': 'white pawn', + 'supercategory': 'pieces' + }, { + 'id': 12, + 'name': 'white queen', + 'supercategory': 'pieces' + }, { + 'id': 13, + 'name': 'white rook', + 'supercategory': 'pieces' + }] + + _data_root = args.data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/' + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = ChessPieces + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + CottontailRabbits = [{ + 'id': 1, + 'name': 'rabbit', + 'supercategory': 'Cottontail-Rabbit' + }] + + _data_root = args.data_root + 'CottontailRabbits/' + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = CottontailRabbits + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + NorthAmericaMushrooms = [{ + 'id': 1, + 'name': 'flat mushroom', + 'supercategory': 'mushroom' + }, { + 'id': 2, + 'name': 'yellow mushroom', + 'supercategory': 'mushroom' + }] + + _data_root = args.data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/' # noqa + json_data = mmengine.load(_data_root + + 'valid/annotations_without_background.json') + json_data['categories'] = NorthAmericaMushrooms + mmengine.dump(json_data, + _data_root + 'valid/new_annotations_without_background.json') + + +if __name__ == '__main__': + main() diff --git a/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py b/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py new file mode 100644 index 00000000000..ae9617ef30f --- /dev/null +++ b/configs/mm_grounding_dino/people_in_painting/grounding_dino_swin-t_finetune_8xb4_50e_people_in_painting.py @@ -0,0 +1,109 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +# https://universe.roboflow.com/roboflow-100/people-in-paintings/dataset/2 +data_root = 'data/people_in_painting_v2/' +class_name = ('Human', ) +palette = [(220, 20, 60)] + +metainfo = dict(classes=class_name, palette=palette) + +train_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The aspect ratio of all images in the train dataset is < 7 + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='RepeatDataset', + times=10, + dataset=dict( + type='CocoDataset', + data_root=data_root, + metainfo=metainfo, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + return_classes=True, + data_prefix=dict(img='train/'), + ann_file='train/_annotations.coco.json'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + return_classes=True, + ann_file='valid/_annotations.coco.json', + data_prefix=dict(img='valid/'))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'valid/_annotations.coco.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[4], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_grefcoco.py b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_grefcoco.py new file mode 100644 index 00000000000..a6ce25e904d --- /dev/null +++ b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_grefcoco.py @@ -0,0 +1,169 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + # flip is disabled because referring expressions may describe left/right positions + dict(type='RandomFlip', prob=0.0), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The aspect ratio of all images in the train dataset is < 7 + # follow
the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='ODVGDataset', + data_root=data_root, + ann_file='mdetr_annotations/finetune_grefcoco_train_vg.json', + data_prefix=dict(img='train2014/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + return_classes=True, + pipeline=train_pipeline)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB +] +dataset_prefixes = ['grefcoco_val', 'grefcoco_testA', 'grefcoco_testB'] +metrics = [ + val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB +] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': 
dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco.py b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco.py new file mode 100644 index 00000000000..d26bf98c0f7 --- /dev/null +++ b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco.py @@ -0,0 +1,169 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + # flip is disabled because referring expressions may describe left/right positions + dict(type='RandomFlip', prob=0.0), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The aspect ratio of all images in the train dataset is < 7 + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='ODVGDataset', + data_root=data_root, + ann_file='mdetr_annotations/finetune_refcoco_train_vg.json', + data_prefix=dict(img='train2014/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + return_classes=True, + pipeline=train_pipeline)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# 
-------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB +] +dataset_prefixes = ['refcoco_val', 'refcoco_testA', 'refcoco_testB'] +metrics = [ + val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB +] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco_plus.py b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco_plus.py new file mode 100644 index 00000000000..ff084b8c514 --- /dev/null +++ b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcoco_plus.py @@ -0,0 +1,169 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + # flip is disabled because referring expressions may describe left/right positions + dict(type='RandomFlip', prob=0.0), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The aspect ratio of all images in the train dataset is < 7 + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 
'tokens_positive', 'dataset_mode')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='ODVGDataset', + data_root=data_root, + ann_file='mdetr_annotations/finetune_refcoco+_train_vg.json', + data_prefix=dict(img='train2014/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + return_classes=True, + pipeline=train_pipeline)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB +] +dataset_prefixes = ['refcoco+_val', 'refcoco+_testA', 'refcoco+_testB'] +metrics = [ + val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB +] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcocog.py b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcocog.py new file mode 100644 index 00000000000..79ec375c756 --- /dev/null +++ b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_finetune_refcocog.py @@ -0,0 +1,169 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 
'data/coco/' + +train_pipeline = [ + dict(type='LoadImageFromFile', backend_args=_base_.backend_args), + dict(type='LoadAnnotations', with_bbox=True), + # flip is disabled because referring expressions may describe left/right positions + dict(type='RandomFlip', prob=0.0), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The aspect ratio of all images in the train dataset is < 7 + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type='FilterAnnotations', min_gt_bbox_wh=(1e-2, 1e-2)), + dict( + type='RandomSamplingNegPos', + tokenizer_name=_base_.lang_model_name, + num_sample_negative=85, + max_tokens=256), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities', 'tokens_positive', 'dataset_mode')) +] + +train_dataloader = dict( + dataset=dict( + _delete_=True, + type='ODVGDataset', + data_root=data_root, + ann_file='mdetr_annotations/finetune_refcocog_train_vg.json', + data_prefix=dict(img='train2014/'), + filter_cfg=dict(filter_empty_gt=False, min_size=32), + return_classes=True, + pipeline=train_pipeline)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcocog_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcocog_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcocog_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=_base_.test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB +] +dataset_prefixes = ['refcocog_val', 'refcocog_testA', 'refcocog_testB'] +metrics = [ + val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB +] + 
+val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1), + # 'language_model': dict(lr_mult=0), + })) + +# learning policy +max_epochs = 5 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[3], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) + +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py new file mode 100644 index 00000000000..437d71c6b35 --- /dev/null +++ b/configs/mm_grounding_dino/refcoco/grounding_dino_swin-t_pretrain_zeroshot_refexp.py @@ -0,0 +1,228 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +# 30 is an empirical value, just set it to the maximum value +# without affecting the evaluation result +model = dict(test_cfg=dict(max_per_img=30)) + +data_root = 'data/coco/' + +test_pipeline = [ + dict( + type='LoadImageFromFile', backend_args=None, + imdecode_backend='pillow'), + dict( + type='FixScaleResize', + scale=(800, 1333), + keep_ratio=True, + backend='pillow'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'text', 'custom_entities', + 'tokens_positive')) +] + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/final_refexp_val.json' +val_dataset_all_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) +val_evaluator_all_val = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testA.json' +val_dataset_refcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco_testB.json' +val_dataset_refcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 
'mdetr_annotations/finetune_refcoco+_testA.json' +val_dataset_refcoco_plus_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_plus_testA = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcoco+_testB.json' +val_dataset_refcoco_plus_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcoco_plus_testB = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_refcocog_test.json' +val_dataset_refcocog_test = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_refcocog_test = dict( + type='RefExpMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + topk=(1, 5, 10)) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_val.json' +val_dataset_grefcoco_val = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_val = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testA.json' +val_dataset_grefcoco_testA = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_testA = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +ann_file = 'mdetr_annotations/finetune_grefcoco_testB.json' +val_dataset_grefcoco_testB = dict( + type='MDETRStyleRefCocoDataset', + data_root=data_root, + ann_file=ann_file, + data_prefix=dict(img='train2014/'), + test_mode=True, + return_classes=True, + pipeline=test_pipeline, + backend_args=None) + +val_evaluator_grefcoco_testB = dict( + type='gRefCOCOMetric', + ann_file=data_root + ann_file, + metric='bbox', + iou_thrs=0.5, + thresh_score=0.7, + thresh_f1=1.0) + +# -------------------------------------------------# +datasets = [ + val_dataset_all_val, val_dataset_refcoco_testA, val_dataset_refcoco_testB, + val_dataset_refcoco_plus_testA, val_dataset_refcoco_plus_testB, + val_dataset_refcocog_test, val_dataset_grefcoco_val, + val_dataset_grefcoco_testA, val_dataset_grefcoco_testB +] +dataset_prefixes = [ + 'val', 'refcoco_testA', 'refcoco_testB', 'refcoco+_testA', + 'refcoco+_testB', 'refcocog_test', 'grefcoco_val', 'grefcoco_testA', + 'grefcoco_testB' +] +metrics = [ + 
val_evaluator_all_val, val_evaluator_refcoco_testA, + val_evaluator_refcoco_testB, val_evaluator_refcoco_plus_testA, + val_evaluator_refcoco_plus_testB, val_evaluator_refcocog_test, + val_evaluator_grefcoco_val, val_evaluator_grefcoco_testA, + val_evaluator_grefcoco_testB +] + +val_dataloader = dict( + dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets)) +test_dataloader = val_dataloader + +val_evaluator = dict( + _delete_=True, + type='MultiDatasetsEvaluator', + metrics=metrics, + dataset_prefixes=dataset_prefixes) +test_evaluator = val_evaluator diff --git a/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py b/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py new file mode 100644 index 00000000000..db167f671c1 --- /dev/null +++ b/configs/mm_grounding_dino/rtts/grounding_dino_swin-t_finetune_8xb4_1x_rtts.py @@ -0,0 +1,106 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/RTTS/' +class_name = ('bicycle', 'bus', 'car', 'motorbike', 'person') +palette = [(255, 97, 0), (0, 201, 87), (176, 23, 31), (138, 43, 226), + (30, 144, 255)] + +metainfo = dict(classes=class_name, palette=palette) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='CocoDataset', + data_root=data_root, + metainfo=metainfo, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + return_classes=True, + ann_file='annotations_json/rtts_train.json', + data_prefix=dict(img=''))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + return_classes=True, + ann_file='annotations_json/rtts_val.json', + data_prefix=dict(img=''))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations_json/rtts_val.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 12 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + 
by_epoch=True, + milestones=[11], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git a/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py b/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py new file mode 100644 index 00000000000..16a6a6cbb7a --- /dev/null +++ b/configs/mm_grounding_dino/ruod/grounding_dino_swin-t_finetune_8xb4_1x_ruod.py @@ -0,0 +1,108 @@ +_base_ = '../grounding_dino_swin-t_pretrain_obj365.py' + +data_root = 'data/RUOD/' +class_name = ('holothurian', 'echinus', 'scallop', 'starfish', 'fish', + 'corals', 'diver', 'cuttlefish', 'turtle', 'jellyfish') +palette = [(235, 211, 70), (106, 90, 205), (160, 32, 240), (176, 23, 31), + (142, 0, 0), (230, 0, 0), (106, 0, 228), (60, 100, 0), (80, 100, 0), + (70, 0, 0)] + +metainfo = dict(classes=class_name, palette=palette) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomChoice', + transforms=[ + [ + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type='RandomChoiceResize', + # The radio of all image in train dataset < 7 + # follow the original implement + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type='RandomChoiceResize', + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict( + type='PackDetInputs', + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction', 'text', + 'custom_entities')) +] + +train_dataloader = dict( + sampler=dict(_delete_=True, type='DefaultSampler', shuffle=True), + batch_sampler=dict(type='AspectRatioBatchSampler'), + dataset=dict( + _delete_=True, + type='CocoDataset', + data_root=data_root, + metainfo=metainfo, + filter_cfg=dict(filter_empty_gt=False, min_size=32), + pipeline=train_pipeline, + return_classes=True, + ann_file='RUOD_ANN/instances_train.json', + data_prefix=dict(img='RUOD_pic/train/'))) + +val_dataloader = dict( + dataset=dict( + metainfo=metainfo, + data_root=data_root, + return_classes=True, + ann_file='RUOD_ANN/instances_test.json', + data_prefix=dict(img='RUOD_pic/test/'))) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'RUOD_ANN/instances_test.json', + metric='bbox', + format_only=False) +test_evaluator = val_evaluator + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'backbone': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 12 +param_scheduler = [ + dict( + type='MultiStepLR', + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] +train_cfg = dict(max_epochs=max_epochs, val_interval=1) +default_hooks = dict(checkpoint=dict(max_keep_ckpts=1, save_best='auto')) + +load_from = '' diff --git 
a/configs/odinw/README.md b/configs/odinw/README.md deleted file mode 100644 index 72d95933653..00000000000 --- a/configs/odinw/README.md +++ /dev/null @@ -1,106 +0,0 @@ -# ODinW - -[ELEVATER: A Benchmark and Toolkit for Evaluating Language-Augmented Visual Models](https://arxiv.org/pdf/2204.08790.pdf) - - - -## Get Started - -1. To download dataset, you can refer to [reference document](../../docs/zh_cn/user_guides/dataset_prepare.md) - -2. You can use the following data to run the inference. - -```shell -cd $MMDETROOT - -python tools/test.py configs/odinw35/glip_atss_swin_t_fpn_dataset.py checkpoints/glip_tiny_a_mmdet-b3654169.pth -``` - -## Abstract - -Learning visual representations from natural language supervision has recently shown great promise in a number of pioneering works. In general, these language-augmented visual models demonstrate strong transferability to a variety of datasets and tasks. However, it remains challenging to evaluate the transferablity of these models due to the lack of easy-to-use evaluation toolkits and public benchmarks. To tackle this, we build ELEVATER 1 , the first benchmark and toolkit for evaluating (pre-trained) language-augmented visual models. ELEVATER is composed of three components. (i) Datasets. As downstream evaluation suites, it consists of 20 image classification datasets and 35 object detection datasets, each of which is augmented with external knowledge. (ii) Toolkit. An automatic hyper-parameter tuning toolkit is developed to facilitate model evaluation on downstream tasks. (iii) Metrics. A variety of evaluation metrics are used to measure sample-efficiency (zero-shot and few-shot) and parameter-efficiency (linear probing and full model fine-tuning). ELEVATER is platform for Computer Vision in the Wild (CVinW), and is publicly released at https://computer-vision-in-the-wild.github.io/ELEVATER/ - -## Results and models of odinw13 - -| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | -| --------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | -| AerialMaritimeDrone | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | -| Aquarium | 0.175 | 0.174 | 0.173 | 0.169 | 0.191 | 0.190 | 0.195 | 0.445 | -| CottontailRabbits | 0.686 | 0.686 | 0.688 | 0.688 | 0.744 | 0.744 | 0.799 | 0.808 | -| EgoHands | 0.013 | 0.013 | 0.003 | 0.004 | 0.314 | 0.315 | 0.608 | 0.764 | -| NorthAmericaMushrooms | 0.502 | 0.502 | 0.367 | 0.367 | 0.297 | 0.296 | 0.507 | 0.675 | -| Packages | 0.589 | 0.589 | 0.083 | 0.083 | 0.699 | 0.699 | 0.687 | 0.670 | -| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | -| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | -| pothole | 0.007 | 0.007 | 0.030 | 0.030 | 0.058 | 0.058 | 0.215 | 0.478 | -| Raccoon | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | -| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | -| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | -| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | -| Average | **0.325** | **0.324** | **0.320** | **0.318** | **0.392** | **0.392** | **0.514** | **0.621** | - -Note: - -1. The above are zero-shot evaluation results. -2. The config and weights of GLIPs models can be found at [here](../glip/README.md) -3. 
The config and weights of GroundingDINO models can be found at [here](../grounding_dino/README.md) - -## Results and models of odinw35 - -| Method | GLIP-T(A) | Official | GLIP-T(B) | Official | GLIP-T(C) | Official | GroundingDINO-T | GroundingDINO-B | -| --------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- | -| AerialMaritimeDrone_large | 0.123 | 0.122 | 0.110 | 0.110 | 0.130 | 0.130 | 0.173 | 0.281 | -| AerialMaritimeDrone_tiled | 0.174 | 0.174 | 0.172 | 0.172 | 0.172 | 0.172 | 0.206 | 0.364 | -| AmericanSignLanguageLetters | 0.001 | 0.001 | 0.003 | 0.003 | 0.009 | 0.009 | 0.002 | 0.096 | -| Aquarium | 0.175 | 0.175 | 0.173 | 0.171 | 0.192 | 0.182 | 0.195 | 0.445 | -| BCCD | 0.016 | 0.016 | 0.001 | 0.001 | 0.000 | 0.000 | 0.161 | 0.584 | -| boggleBoards | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.134 | -| brackishUnderwater | 0.016 | 0..013 | 0.021 | 0.027 | 0.020 | 0.022 | 0.021 | 0.454 | -| ChessPieces | 0.001 | 0.001 | 0.000 | 0.000 | 0.001 | 0.001 | 0.000 | 0.000 | -| CottontailRabbits | 0.710 | 0.709 | 0.683 | 0.683 | 0.752 | 0.752 | 0.806 | 0.797 | -| dice | 0.005 | 0.005 | 0.004 | 0.004 | 0.004 | 0.004 | 0.004 | 0.082 | -| DroneControl | 0.016 | 0.017 | 0.006 | 0.008 | 0.005 | 0.007 | 0.042 | 0.638 | -| EgoHands_generic | 0.009 | 0.010 | 0.005 | 0.006 | 0.510 | 0.508 | 0.608 | 0.764 | -| EgoHands_specific | 0.001 | 0.001 | 0.004 | 0.006 | 0.003 | 0.004 | 0.002 | 0.687 | -| HardHatWorkers | 0.029 | 0.029 | 0.023 | 0.023 | 0.033 | 0.033 | 0.046 | 0.439 | -| MaskWearing | 0.007 | 0.007 | 0.003 | 0.002 | 0.005 | 0.005 | 0.004 | 0.406 | -| MountainDewCommercial | 0.218 | 0.227 | 0.199 | 0.197 | 0.478 | 0.463 | 0.430 | 0.580 | -| NorthAmericaMushrooms | 0.502 | 0.502 | 0.450 | 0.450 | 0.497 | 0.497 | 0.471 | 0.501 | -| openPoetryVision | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.051 | -| OxfordPets_by_breed | 0.001 | 0.002 | 0.002 | 0.004 | 0.001 | 0.002 | 0.003 | 0.799 | -| OxfordPets_by_species | 0.016 | 0.011 | 0.012 | 0.009 | 0.013 | 0.009 | 0.011 | 0.872 | -| PKLot | 0.002 | 0.002 | 0.000 | 0.000 | 0.000 | 0.000 | 0.001 | 0.774 | -| Packages | 0.569 | 0.569 | 0.279 | 0.279 | 0.712 | 0.712 | 0.695 | 0.728 | -| PascalVOC | 0.512 | 0.512 | 0.541 | 0.540 | 0.565 | 0.565 | 0.563 | 0.711 | -| pistols | 0.339 | 0.339 | 0.502 | 0.501 | 0.503 | 0.504 | 0.726 | 0.771 | -| plantdoc | 0.002 | 0.002 | 0.007 | 0.007 | 0.009 | 0.009 | 0.005 | 0.376 | -| pothole | 0.007 | 0.010 | 0.024 | 0.025 | 0.085 | 0.101 | 0.215 | 0.478 | -| Raccoons | 0.075 | 0.074 | 0.285 | 0.288 | 0.241 | 0.244 | 0.549 | 0.541 | -| selfdrivingCar | 0.071 | 0.072 | 0.074 | 0.074 | 0.081 | 0.080 | 0.089 | 0.318 | -| ShellfishOpenImages | 0.253 | 0.253 | 0.337 | 0.338 | 0.300 | 0.302 | 0.393 | 0.650 | -| ThermalCheetah | 0.028 | 0.028 | 0.000 | 0.000 | 0.028 | 0.028 | 0.087 | 0.290 | -| thermalDogsAndPeople | 0.372 | 0.372 | 0.475 | 0.475 | 0.510 | 0.510 | 0.657 | 0.633 | -| UnoCards | 0.000 | 0.000 | 0.000 | 0.001 | 0.002 | 0.003 | 0.006 | 0.754 | -| VehiclesOpenImages | 0.574 | 0.566 | 0.562 | 0.547 | 0.549 | 0.534 | 0.613 | 0.647 | -| WildfireSmoke | 0.000 | 0.000 | 0.000 | 0.000 | 0.017 | 0.017 | 0.134 | 0.410 | -| websiteScreenshots | 0.003 | 0.004 | 0.003 | 0.005 | 0.005 | 0.006 | 0.012 | 0.175 | -| Average | **0.134** | **0.134** | **0.138** | **0.138** | **0.179** | **0.178** | **0.227** | **0.492** | - -Note: - -1. The above are zero-shot evaluation results. -2. 
The config and weights of GLIPs models can be found at [here](../glip/README.md) -3. The config and weights of GroundingDINO models can be found at [here](../grounding_dino/README.md) - -## Citation - -``` -@misc{li2022elevater, - title={ELEVATER: A Benchmark and Toolkit for Evaluating Language-Augmented Visual Models}, - author={Chunyuan Li and Haotian Liu and Liunian Harold Li and Pengchuan Zhang and Jyoti Aneja and Jianwei Yang and Ping Jin and Houdong Hu and Zicheng Liu and Yong Jae Lee and Jianfeng Gao}, - year={2022}, - eprint={2204.08790}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` diff --git a/demo/image_demo.py b/demo/image_demo.py index 5a9c906cef0..1f994cb40ea 100644 --- a/demo/image_demo.py +++ b/demo/image_demo.py @@ -37,6 +37,15 @@ --texts '$: lvis' --pred-score-thr 0.7 \ --palette random --chunked-size 80 + python demo/image_demo.py demo/demo.jpg \ + grounding_dino_swin-t_pretrain_obj365_goldg_cap4m \ + --texts '$: lvis' --pred-score-thr 0.4 \ + --palette random --chunked-size 80 + + python demo/image_demo.py demo/demo.jpg \ + grounding_dino_swin-t_pretrain_obj365_goldg_cap4m \ + --texts "a red car in the upper right corner" \ + --tokens-positive -1 Visualize prediction results:: @@ -46,6 +55,7 @@ --show """ +import ast from argparse import ArgumentParser from mmengine.logging import print_log @@ -122,6 +132,15 @@ def parse_args(): default=-1, help='If the number of categories is very large, ' 'you can specify this parameter to truncate multiple predictions.') + # only for Grounding DINO + parser.add_argument( + '--tokens-positive', + '-p', + type=str, + help='Used to specify which locations in the input text are of ' + 'interest to the user. -1 indicates that no area is of interest, ' + 'None indicates ignoring this parameter. ' + 'The two-dimensional array represents the start and end positions.') call_args = vars(parser.parse_args()) @@ -140,6 +159,10 @@ def parse_args(): class_names = get_classes(dataset_name) call_args['texts'] = [tuple(class_names)] + if call_args['tokens_positive'] is not None: + call_args['tokens_positive'] = ast.literal_eval( + call_args['tokens_positive']) + init_kws = ['model', 'weights', 'device', 'palette'] init_args = {} for init_kw in init_kws: diff --git a/mmdet/apis/det_inferencer.py b/mmdet/apis/det_inferencer.py index 9efbb00cbe9..ce8532eb786 100644 --- a/mmdet/apis/det_inferencer.py +++ b/mmdet/apis/det_inferencer.py @@ -313,8 +313,10 @@ def __call__( texts: Optional[Union[str, list]] = None, # by open panoptic task stuff_texts: Optional[Union[str, list]] = None, - # by GLIP + # by GLIP and Grounding DINO custom_entities: bool = False, + # by Grounding DINO + tokens_positive: Optional[Union[int, list]] = None, **kwargs) -> dict: """Call the inferencer. @@ -343,7 +345,7 @@ def __call__( stuff_texts (str | list[str]): Stuff text prompts of open panoptic task. Defaults to None. custom_entities (bool): Whether to use custom entities. - Defaults to False. Only used in GLIP. + Defaults to False. Only used in GLIP and Grounding DINO. **kwargs: Other keyword arguments passed to :meth:`preprocess`, :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. 
Each key in kwargs should be in the corresponding set of @@ -366,6 +368,10 @@ def __call__( texts = [texts] * len(ori_inputs) if stuff_texts is not None and isinstance(stuff_texts, str): stuff_texts = [stuff_texts] * len(ori_inputs) + + # Currently only supports bs=1 + tokens_positive = [tokens_positive] * len(ori_inputs) + if texts is not None: assert len(texts) == len(ori_inputs) for i in range(len(texts)): @@ -373,13 +379,15 @@ def __call__( ori_inputs[i] = { 'text': texts[i], 'img_path': ori_inputs[i], - 'custom_entities': custom_entities + 'custom_entities': custom_entities, + 'tokens_positive': tokens_positive[i] } else: ori_inputs[i] = { 'text': texts[i], 'img': ori_inputs[i], - 'custom_entities': custom_entities + 'custom_entities': custom_entities, + 'tokens_positive': tokens_positive[i] } if stuff_texts is not None: assert len(stuff_texts) == len(ori_inputs) diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py index 044efe4cad7..670c207cacf 100644 --- a/mmdet/datasets/__init__.py +++ b/mmdet/datasets/__init__.py @@ -12,17 +12,22 @@ from .crowdhuman import CrowdHumanDataset from .dataset_wrappers import ConcatDataset, MultiImageMixDataset from .deepfashion import DeepFashionDataset +from .dod import DODDataset from .dsdl import DSDLDetDataset +from .flickr30k import Flickr30kDataset from .isaid import iSAIDDataset from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset +from .mdetr_style_refcoco import MDETRStyleRefCocoDataset from .mot_challenge_dataset import MOTChallengeDataset from .objects365 import Objects365V1Dataset, Objects365V2Dataset +from .odvg import ODVGDataset from .openimages import OpenImagesChallengeDataset, OpenImagesDataset from .refcoco import RefCocoDataset from .reid_dataset import ReIDDataset from .samplers import (AspectRatioBatchSampler, ClassAwareSampler, - GroupMultiSourceSampler, MultiSourceSampler, - TrackAspectRatioBatchSampler, TrackImgSampler) + CustomSampleSizeSampler, GroupMultiSourceSampler, + MultiSourceSampler, TrackAspectRatioBatchSampler, + TrackImgSampler) from .utils import get_loading_pipeline from .v3det import V3DetDataset from .voc import VOCDataset @@ -42,5 +47,7 @@ 'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler', 'ADE20KPanopticDataset', 'CocoCaptionDataset', 'RefCocoDataset', 'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset', - 'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset' + 'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset', + 'ODVGDataset', 'MDETRStyleRefCocoDataset', 'DODDataset', + 'CustomSampleSizeSampler', 'Flickr30kDataset' ] diff --git a/mmdet/datasets/dod.py b/mmdet/datasets/dod.py new file mode 100644 index 00000000000..152d32aaf70 --- /dev/null +++ b/mmdet/datasets/dod.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
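+# DODDataset wraps the d-cube toolkit for the Described Object Detection
+# (DOD) benchmark. A minimal config sketch follows; the paths below are
+# illustrative placeholders, not an official config:
+#
+#     dataset = dict(
+#         type='DODDataset',
+#         data_root='data/d3/',
+#         ann_file='d3_json/d3_full_annotations.json',
+#         data_prefix=dict(img='d3_images/', anno='d3_pkl'),
+#         pipeline=test_pipeline,
+#         test_mode=True,
+#         return_classes=True)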
+import os.path as osp +from typing import List, Optional + +import numpy as np + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + +try: + from d_cube import D3 +except ImportError: + D3 = None +from .api_wrappers import COCO + + +@DATASETS.register_module() +class DODDataset(BaseDetDataset): + + def __init__(self, + *args, + data_root: Optional[str] = '', + data_prefix: dict = dict(img_path=''), + **kwargs) -> None: + if D3 is None: + raise ImportError( + 'Please install d3 by `pip install ddd-dataset`.') + pkl_anno_path = osp.join(data_root, data_prefix['anno']) + self.img_root = osp.join(data_root, data_prefix['img']) + self.d3 = D3(self.img_root, pkl_anno_path) + + sent_infos = self.d3.load_sents() + classes = tuple([sent_info['raw_sent'] for sent_info in sent_infos]) + super().__init__( + *args, + data_root=data_root, + data_prefix=data_prefix, + metainfo={'classes': classes}, + **kwargs) + + def load_data_list(self) -> List[dict]: + coco = COCO(self.ann_file) + data_list = [] + img_ids = self.d3.get_img_ids() + for img_id in img_ids: + data_info = {} + + img_info = self.d3.load_imgs(img_id)[0] + file_name = img_info['file_name'] + img_path = osp.join(self.img_root, file_name) + data_info['img_path'] = img_path + data_info['img_id'] = img_id + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + group_ids = self.d3.get_group_ids(img_ids=[img_id]) + sent_ids = self.d3.get_sent_ids(group_ids=group_ids) + sent_list = self.d3.load_sents(sent_ids=sent_ids) + text_list = [sent['raw_sent'] for sent in sent_list] + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + anno = coco.load_anns(ann_ids) + + data_info['text'] = text_list + data_info['sent_ids'] = np.array([s for s in sent_ids]) + data_info['custom_entities'] = True + + instances = [] + for i, ann in enumerate(anno): + instance = {} + x1, y1, w, h = ann['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = ann['category_id'] - 1 + instances.append(instance) + data_info['instances'] = instances + data_list.append(data_info) + return data_list diff --git a/mmdet/datasets/flickr30k.py b/mmdet/datasets/flickr30k.py new file mode 100644 index 00000000000..705873a3ffb --- /dev/null +++ b/mmdet/datasets/flickr30k.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
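+# Flickr30kDataset reads MDETR-style Flickr30K Entities annotations. Each
+# image entry is expected to carry 'caption', 'tokens_positive_eval'
+# (character spans of the evaluated phrases) and per-annotation 'phrase_ids'.
+# Illustrative annotation fragment (all values are made up):
+#
+#     {"images": [{"id": 1, "file_name": "36979.jpg", "height": 374,
+#                  "width": 500, "caption": "A man plays a guitar.",
+#                  "tokens_positive_eval": [[[2, 5]], [[14, 20]]]}],
+#      "annotations": [{"image_id": 1, "category_id": 1, "iscrowd": 0,
+#                       "phrase_ids": 0, "bbox": [10, 20, 100, 200]}]}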
+import os.path as osp +from typing import List + +from pycocotools.coco import COCO + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class Flickr30kDataset(BaseDetDataset): + """Flickr30K Dataset.""" + + def convert_phrase_ids(self, a): + unique_elements = sorted(set(a)) + element_to_new_label = { + element: label + for label, element in enumerate(unique_elements) + } + discreticed_a = [element_to_new_label[element] for element in a] + return discreticed_a + + def load_data_list(self) -> List[dict]: + + self.coco = COCO(self.ann_file) + + self.ids = sorted(list(self.coco.imgs.keys())) + + data_list = [] + for img_id in self.ids: + if isinstance(img_id, str): + ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None) + else: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + + coco_img = self.coco.loadImgs(img_id)[0] + + caption = coco_img['caption'] + file_name = coco_img['file_name'] + img_path = osp.join(self.data_prefix['img'], file_name) + width = coco_img['width'] + height = coco_img['height'] + tokens_positive = coco_img['tokens_positive_eval'] + phrases = [caption[i[0][0]:i[0][1]] for i in tokens_positive] + phrase_ids = [] + + instances = [] + annos = self.coco.loadAnns(ann_ids) + for anno in annos: + instance = {} + instance['bbox'] = [ + anno['bbox'][0], anno['bbox'][1], + anno['bbox'][0] + anno['bbox'][2], + anno['bbox'][1] + anno['bbox'][3] + ] + instance['bbox_label'] = anno['category_id'] + instance['ignore_flag'] = anno['iscrowd'] + phrase_ids.append(anno['phrase_ids']) + instances.append(instance) + + phrase_ids = self.convert_phrase_ids(phrase_ids) + + data_list.append( + dict( + img_path=img_path, + img_id=img_id, + height=height, + width=width, + instances=instances, + text=caption, + phrase_ids=phrase_ids, + tokens_positive=tokens_positive, + phrases=phrases, + )) + + return data_list diff --git a/mmdet/datasets/mdetr_style_refcoco.py b/mmdet/datasets/mdetr_style_refcoco.py new file mode 100644 index 00000000000..cc56dec49db --- /dev/null +++ b/mmdet/datasets/mdetr_style_refcoco.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.datasets import BaseDetDataset +from mmdet.registry import DATASETS +from .api_wrappers import COCO + + +@DATASETS.register_module() +class MDETRStyleRefCocoDataset(BaseDetDataset): + """RefCOCO dataset. + + Only support evaluation now. 
+ """ + + def load_data_list(self) -> List[dict]: + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + coco = COCO(local_path) + + img_ids = coco.get_img_ids() + + data_infos = [] + for img_id in img_ids: + raw_img_info = coco.load_imgs([img_id])[0] + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = coco.load_anns(ann_ids) + + data_info = {} + img_path = osp.join(self.data_prefix['img'], + raw_img_info['file_name']) + data_info['img_path'] = img_path + data_info['img_id'] = img_id + data_info['height'] = raw_img_info['height'] + data_info['width'] = raw_img_info['width'] + data_info['dataset_mode'] = raw_img_info['dataset_name'] + + data_info['text'] = raw_img_info['caption'] + data_info['custom_entities'] = False + data_info['tokens_positive'] = -1 + + instances = [] + for i, ann in enumerate(raw_ann_info): + instance = {} + x1, y1, w, h = ann['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['bbox'] = bbox + instance['bbox_label'] = ann['category_id'] + instance['ignore_flag'] = 0 + instances.append(instance) + + data_info['instances'] = instances + data_infos.append(data_info) + return data_infos diff --git a/mmdet/datasets/odvg.py b/mmdet/datasets/odvg.py new file mode 100644 index 00000000000..82c8aa1d0ab --- /dev/null +++ b/mmdet/datasets/odvg.py @@ -0,0 +1,107 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os.path as osp +from typing import List, Optional + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class ODVGDataset(BaseDetDataset): + """object detection and visual grounding dataset.""" + + def __init__(self, + *args, + data_root: str = '', + label_map_file: Optional[str] = None, + need_text: bool = True, + **kwargs) -> None: + self.dataset_mode = 'VG' + self.need_text = need_text + if label_map_file: + label_map_file = osp.join(data_root, label_map_file) + with open(label_map_file, 'r') as file: + self.label_map = json.load(file) + self.dataset_mode = 'OD' + super().__init__(*args, data_root=data_root, **kwargs) + assert self.return_classes is True + + def load_data_list(self) -> List[dict]: + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + data_list = [json.loads(line) for line in f] + + out_data_list = [] + for data in data_list: + data_info = {} + img_path = osp.join(self.data_prefix['img'], data['filename']) + data_info['img_path'] = img_path + data_info['height'] = data['height'] + data_info['width'] = data['width'] + if self.dataset_mode == 'OD': + if self.need_text: + data_info['text'] = self.label_map + anno = data['detection'] + instances = [obj for obj in anno['instances']] + bboxes = [obj['bbox'] for obj in instances] + bbox_labels = [str(obj['label']) for obj in instances] + + instances = [] + for bbox, label in zip(bboxes, bbox_labels): + instance = {} + x1, y1, x2, y2 = bbox + inter_w = max(0, min(x2, data['width']) - max(x1, 0)) + inter_h = max(0, min(y2, data['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if (x2 - x1) < 1 or (y2 - y1) < 1: + continue + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = int(label) + instances.append(instance) + data_info['instances'] = instances + data_info['dataset_mode'] = self.dataset_mode + out_data_list.append(data_info) + else: + anno = data['grounding'] + data_info['text'] = anno['caption'] + regions = 
anno['regions'] + + instances = [] + phrases = {} + for i, region in enumerate(regions): + bbox = region['bbox'] + phrase = region['phrase'] + tokens_positive = region['tokens_positive'] + if not isinstance(bbox[0], list): + bbox = [bbox] + for box in bbox: + instance = {} + x1, y1, x2, y2 = box + inter_w = max(0, min(x2, data['width']) - max(x1, 0)) + inter_h = max(0, min(y2, data['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if (x2 - x1) < 1 or (y2 - y1) < 1: + continue + instance['ignore_flag'] = 0 + instance['bbox'] = box + instance['bbox_label'] = i + # phrase only for vis. tokens_positive is important + phrases[i] = { + 'phrase': phrase, + 'tokens_positive': tokens_positive + } + instances.append(instance) + data_info['instances'] = instances + data_info['phrases'] = phrases + data_info['dataset_mode'] = self.dataset_mode + out_data_list.append(data_info) + + del data_list + return out_data_list diff --git a/mmdet/datasets/samplers/__init__.py b/mmdet/datasets/samplers/__init__.py index a942ff2199c..9ea0e4cb062 100644 --- a/mmdet/datasets/samplers/__init__.py +++ b/mmdet/datasets/samplers/__init__.py @@ -3,6 +3,7 @@ MultiDataAspectRatioBatchSampler, TrackAspectRatioBatchSampler) from .class_aware_sampler import ClassAwareSampler +from .custom_sample_size_sampler import CustomSampleSizeSampler from .multi_data_sampler import MultiDataSampler from .multi_source_sampler import GroupMultiSourceSampler, MultiSourceSampler from .track_img_sampler import TrackImgSampler @@ -11,5 +12,5 @@ 'ClassAwareSampler', 'AspectRatioBatchSampler', 'MultiSourceSampler', 'GroupMultiSourceSampler', 'TrackImgSampler', 'TrackAspectRatioBatchSampler', 'MultiDataSampler', - 'MultiDataAspectRatioBatchSampler' + 'MultiDataAspectRatioBatchSampler', 'CustomSampleSizeSampler' ] diff --git a/mmdet/datasets/samplers/custom_sample_size_sampler.py b/mmdet/datasets/samplers/custom_sample_size_sampler.py new file mode 100644 index 00000000000..6bedf6c66be --- /dev/null +++ b/mmdet/datasets/samplers/custom_sample_size_sampler.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
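+# CustomSampleSizeSampler draws a fixed (or ratio-based) number of samples
+# from each sub-dataset of a ConcatDataset per epoch; -1 keeps a sub-dataset
+# in full. Usage sketch (dataset names are placeholders):
+#
+#     train_dataloader = dict(
+#         sampler=dict(
+#             _delete_=True,
+#             type='CustomSampleSizeSampler',
+#             dataset_size=[-1, 50000]),  # all of dataset A, 50k from B
+#         dataset=dict(type='ConcatDataset', datasets=[dataset_a, dataset_b]))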
+import math +from typing import Iterator, Optional, Sequence, Sized + +import torch +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS +from .class_aware_sampler import RandomCycleIter + + +@DATA_SAMPLERS.register_module() +class CustomSampleSizeSampler(Sampler): + + def __init__(self, + dataset: Sized, + dataset_size: Sequence[int], + ratio_mode: bool = False, + seed: Optional[int] = None, + round_up: bool = True) -> None: + assert len(dataset.datasets) == len(dataset_size) + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.epoch = 0 + self.round_up = round_up + + total_size = 0 + total_size_fake = 0 + self.dataset_index = [] + self.dataset_cycle_iter = [] + new_dataset_size = [] + for dataset, size in zip(dataset.datasets, dataset_size): + self.dataset_index.append( + list(range(total_size_fake, + len(dataset) + total_size_fake))) + total_size_fake += len(dataset) + if size == -1: + total_size += len(dataset) + self.dataset_cycle_iter.append(None) + new_dataset_size.append(-1) + else: + if ratio_mode: + size = int(size * len(dataset)) + assert size <= len( + dataset + ), f'dataset size {size} is larger than ' \ + f'dataset length {len(dataset)}' + total_size += size + new_dataset_size.append(size) + + g = torch.Generator() + g.manual_seed(self.seed) + self.dataset_cycle_iter.append( + RandomCycleIter(self.dataset_index[-1], generator=g)) + self.dataset_size = new_dataset_size + + if self.round_up: + self.num_samples = math.ceil(total_size / world_size) + self.total_size = self.num_samples * self.world_size + else: + self.num_samples = math.ceil((total_size - rank) / world_size) + self.total_size = total_size + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + + out_index = [] + for data_size, data_index, cycle_iter in zip(self.dataset_size, + self.dataset_index, + self.dataset_cycle_iter): + if data_size == -1: + out_index += data_index + else: + index = [next(cycle_iter) for _ in range(data_size)] + out_index += index + + index = torch.randperm(len(out_index), generator=g).numpy().tolist() + indices = [out_index[i] for i in index] + + if self.round_up: + indices = ( + indices * + int(self.total_size / len(indices) + 1))[:self.total_size] + indices = indices[self.rank:self.total_size:self.world_size] + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. 
+ """ + self.epoch = epoch diff --git a/mmdet/datasets/transforms/__init__.py b/mmdet/datasets/transforms/__init__.py index 1f30d6c1352..ab3478feb00 100644 --- a/mmdet/datasets/transforms/__init__.py +++ b/mmdet/datasets/transforms/__init__.py @@ -13,6 +13,7 @@ LoadEmptyAnnotations, LoadImageFromNDArray, LoadMultiChannelImageFromFiles, LoadPanopticAnnotations, LoadProposals, LoadTrackAnnotations) +from .text_transformers import LoadTextAnnotations, RandomSamplingNegPos from .transformers_glip import GTBoxSubOne_GLIP, RandomFlip_GLIP from .transforms import (Albu, CachedMixUp, CachedMosaic, CopyPaste, CutOut, Expand, FixScaleResize, FixShapeResize, @@ -39,5 +40,6 @@ 'FixShapeResize', 'ProposalBroadcaster', 'InferencerLoader', 'LoadTrackAnnotations', 'BaseFrameSample', 'UniformRefFrameSample', 'PackTrackInputs', 'PackReIDInputs', 'FixScaleResize', - 'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP' + 'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP', + 'RandomSamplingNegPos', 'LoadTextAnnotations' ] diff --git a/mmdet/datasets/transforms/text_transformers.py b/mmdet/datasets/transforms/text_transformers.py new file mode 100644 index 00000000000..5a6da2a13fa --- /dev/null +++ b/mmdet/datasets/transforms/text_transformers.py @@ -0,0 +1,252 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +from mmcv.transforms import BaseTransform + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import BaseBoxes + +try: + from transformers import AutoTokenizer + from transformers import BertModel as HFBertModel +except ImportError: + AutoTokenizer = None + HFBertModel = None + +import random +import re + +import numpy as np + + +def clean_name(name): + name = re.sub(r'\(.*\)', '', name) + name = re.sub(r'_', ' ', name) + name = re.sub(r' ', ' ', name) + name = name.lower() + return name + + +def check_for_positive_overflow(gt_bboxes, gt_labels, text, tokenizer, + max_tokens): + # Check if we have too many positive labels + # generate a caption by appending the positive labels + positive_label_list = np.unique(gt_labels).tolist() + # random shuffule so we can sample different annotations + # at different epochs + random.shuffle(positive_label_list) + + kept_lables = [] + length = 0 + + for index, label in enumerate(positive_label_list): + + label_text = clean_name(text[str(label)]) + '. ' + + tokenized = tokenizer.tokenize(label_text) + + length += len(tokenized) + + if length > max_tokens: + break + else: + kept_lables.append(label) + + keep_box_index = [] + keep_gt_labels = [] + for i in range(len(gt_labels)): + if gt_labels[i] in kept_lables: + keep_box_index.append(i) + keep_gt_labels.append(gt_labels[i]) + + return gt_bboxes[keep_box_index], np.array( + keep_gt_labels, dtype=np.long), length + + +def generate_senetence_given_labels(positive_label_list, negative_label_list, + text): + label_to_positions = {} + + label_list = negative_label_list + positive_label_list + + random.shuffle(label_list) + + pheso_caption = '' + + label_remap_dict = {} + for index, label in enumerate(label_list): + + start_index = len(pheso_caption) + + pheso_caption += clean_name(text[str(label)]) + + end_index = len(pheso_caption) + + if label in positive_label_list: + label_to_positions[index] = [[start_index, end_index]] + label_remap_dict[int(label)] = index + + # if index != len(label_list) - 1: + # pheso_caption += '. ' + pheso_caption += '. 
' + + return label_to_positions, pheso_caption, label_remap_dict + + +@TRANSFORMS.register_module() +class RandomSamplingNegPos(BaseTransform): + + def __init__(self, + tokenizer_name, + num_sample_negative=85, + max_tokens=256, + full_sampling_prob=0.5, + label_map_file=None): + if AutoTokenizer is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + self.num_sample_negative = num_sample_negative + self.full_sampling_prob = full_sampling_prob + self.max_tokens = max_tokens + self.label_map = None + if label_map_file: + with open(label_map_file, 'r') as file: + self.label_map = json.load(file) + + def transform(self, results: dict) -> dict: + if 'phrases' in results: + return self.vg_aug(results) + else: + return self.od_aug(results) + + def vg_aug(self, results): + gt_bboxes = results['gt_bboxes'] + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = gt_bboxes.tensor + gt_labels = results['gt_bboxes_labels'] + text = results['text'].lower().strip() + if not text.endswith('.'): + text = text + '. ' + + phrases = results['phrases'] + # TODO: add neg + positive_label_list = np.unique(gt_labels).tolist() + label_to_positions = {} + for label in positive_label_list: + label_to_positions[label] = phrases[label]['tokens_positive'] + + results['gt_bboxes'] = gt_bboxes + results['gt_bboxes_labels'] = gt_labels + + results['text'] = text + results['tokens_positive'] = label_to_positions + return results + + def od_aug(self, results): + gt_bboxes = results['gt_bboxes'] + if isinstance(gt_bboxes, BaseBoxes): + gt_bboxes = gt_bboxes.tensor + gt_labels = results['gt_bboxes_labels'] + + if 'text' not in results: + assert self.label_map is not None + text = self.label_map + else: + text = results['text'] + + original_box_num = len(gt_labels) + # If the category name is in the format of 'a/b' (in object365), + # we randomly select one of them. + for key, value in text.items(): + if '/' in value: + text[key] = random.choice(value.split('/')).strip() + + gt_bboxes, gt_labels, positive_caption_length = \ + check_for_positive_overflow(gt_bboxes, gt_labels, + text, self.tokenizer, self.max_tokens) + + if len(gt_bboxes) < original_box_num: + print('WARNING: removed {} boxes due to positive caption overflow'. + format(original_box_num - len(gt_bboxes))) + + valid_negative_indexes = list(text.keys()) + + positive_label_list = np.unique(gt_labels).tolist() + full_negative = self.num_sample_negative + + if full_negative > len(valid_negative_indexes): + full_negative = len(valid_negative_indexes) + + outer_prob = random.random() + + if outer_prob < self.full_sampling_prob: + # c. 
probability_full: add both all positive and all negatives + num_negatives = full_negative + else: + if random.random() < 1.0: + num_negatives = np.random.choice(max(1, full_negative)) + 1 + else: + num_negatives = full_negative + + # Keep some negatives + negative_label_list = set() + if num_negatives != -1: + if num_negatives > len(valid_negative_indexes): + num_negatives = len(valid_negative_indexes) + + for i in np.random.choice( + valid_negative_indexes, size=num_negatives, replace=False): + if i not in positive_label_list: + negative_label_list.add(i) + + random.shuffle(positive_label_list) + + negative_label_list = list(negative_label_list) + random.shuffle(negative_label_list) + + negative_max_length = self.max_tokens - positive_caption_length + screened_negative_label_list = [] + + for negative_label in negative_label_list: + label_text = clean_name(text[str(negative_label)]) + '. ' + + tokenized = self.tokenizer.tokenize(label_text) + + negative_max_length -= len(tokenized) + + if negative_max_length > 0: + screened_negative_label_list.append(negative_label) + else: + break + negative_label_list = screened_negative_label_list + label_to_positions, pheso_caption, label_remap_dict = \ + generate_senetence_given_labels(positive_label_list, + negative_label_list, text) + + # label remap + if len(gt_labels) > 0: + gt_labels = np.vectorize(lambda x: label_remap_dict[x])(gt_labels) + + results['gt_bboxes'] = gt_bboxes + results['gt_bboxes_labels'] = gt_labels + + results['text'] = pheso_caption + results['tokens_positive'] = label_to_positions + + return results + + +@TRANSFORMS.register_module() +class LoadTextAnnotations(BaseTransform): + + def transform(self, results: dict) -> dict: + if 'phrases' in results: + tokens_positive = [ + phrase['tokens_positive'] + for phrase in results['phrases'].values() + ] + results['tokens_positive'] = tokens_positive + return results diff --git a/mmdet/evaluation/metrics/__init__.py b/mmdet/evaluation/metrics/__init__.py index e1ec0e46250..4b61894dbbb 100644 --- a/mmdet/evaluation/metrics/__init__.py +++ b/mmdet/evaluation/metrics/__init__.py @@ -7,11 +7,16 @@ from .coco_panoptic_metric import CocoPanopticMetric from .coco_video_metric import CocoVideoMetric from .crowdhuman_metric import CrowdHumanMetric +from .dod_metric import DODCocoMetric from .dump_det_results import DumpDetResults +from .dump_odvg_results import DumpODVGResults from .dump_proposals_metric import DumpProposals +from .flickr30k_metric import Flickr30kMetric +from .grefcoco_metric import gRefCOCOMetric from .lvis_metric import LVISMetric from .mot_challenge_metric import MOTChallengeMetric from .openimages_metric import OpenImagesMetric +from .refexp_metric import RefExpMetric from .refseg_metric import RefSegMetric from .reid_metric import ReIDMetrics from .semseg_metric import SemSegMetric @@ -23,5 +28,6 @@ 'VOCMetric', 'LVISMetric', 'CrowdHumanMetric', 'DumpProposals', 'CocoOccludedSeparatedMetric', 'DumpDetResults', 'BaseVideoMetric', 'MOTChallengeMetric', 'CocoVideoMetric', 'ReIDMetrics', 'YouTubeVISMetric', - 'COCOCaptionMetric', 'SemSegMetric', 'RefSegMetric' + 'COCOCaptionMetric', 'SemSegMetric', 'RefSegMetric', 'RefExpMetric', + 'gRefCOCOMetric', 'DODCocoMetric', 'DumpODVGResults', 'Flickr30kMetric' ] diff --git a/mmdet/evaluation/metrics/dod_metric.py b/mmdet/evaluation/metrics/dod_metric.py new file mode 100644 index 00000000000..b47d07219da --- /dev/null +++ b/mmdet/evaluation/metrics/dod_metric.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
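+# DODCocoMetric evaluates Described Object Detection results with COCOeval
+# and additionally reports mAP grouped by the token length of each referring
+# description (short / mid / long / very long). Evaluator config sketch
+# (the annotation path is an illustrative placeholder):
+#
+#     val_evaluator = dict(
+#         type='DODCocoMetric',
+#         ann_file='data/d3/d3_json/d3_full_annotations.json')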
+from collections import defaultdict
+from typing import List, Optional, Sequence
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import get_local_path
+from mmengine.logging import MMLogger
+
+from mmdet.datasets.api_wrappers import COCO, COCOeval
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class DODCocoMetric(BaseMetric):
+
+    default_prefix: Optional[str] = 'dod'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 outfile_prefix: Optional[str] = None,
+                 backend_args: Optional[dict] = None,
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.outfile_prefix = outfile_prefix
+        with get_local_path(
+                ann_file, backend_args=backend_args) as local_path:
+            self._coco_api = COCO(local_path)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        for data_sample in data_samples:
+            result = dict()
+            pred = data_sample['pred_instances']
+            result['img_id'] = data_sample['img_id']
+            result['bboxes'] = pred['bboxes'].cpu().numpy()
+            result['scores'] = pred['scores'].cpu().numpy()
+            # map predicted label indices back to the sentence ids of this image
+            result['labels'] = pred['labels'].cpu().numpy()
+            result['labels'] = data_sample['sent_ids'][result['labels']]
+            self.results.append(result)
+
+    def xyxy2xywh(self, bbox: np.ndarray) -> list:
+        """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO
+        evaluation.
+
+        Args:
+            bbox (numpy.ndarray): The bounding boxes, shape (4, ), in
+                ``xyxy`` order.
+
+        Returns:
+            list[float]: The converted bounding boxes, in ``xywh`` order.
+        """
+
+        _bbox: List = bbox.tolist()
+        return [
+            _bbox[0],
+            _bbox[1],
+            _bbox[2] - _bbox[0],
+            _bbox[3] - _bbox[1],
+        ]
+
+    def results2json(self, results: Sequence[dict]) -> list:
+        """Convert detection results to COCO-style bbox entries.
+
+        Only bbox predictions are handled here; each prediction is turned
+        into a dict with ``image_id``, ``bbox``, ``score`` and
+        ``category_id`` so that it can be loaded with ``COCO.loadRes``.
+
+        Args:
+            results (Sequence[dict]): Testing results of the dataset.
+
+        Returns:
+            list[dict]: The converted bbox entries.
+ """ + bbox_json_results = [] + for idx, result in enumerate(results): + image_id = result.get('img_id', idx) + labels = result['labels'] + bboxes = result['bboxes'] + scores = result['scores'] + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(scores[i]) + data['category_id'] = label + bbox_json_results.append(data) + return bbox_json_results + + def compute_metrics(self, results: list) -> dict: + logger: MMLogger = MMLogger.get_current_instance() + result_files = self.results2json(results) + d3_res = self._coco_api.loadRes(result_files) + cocoEval = COCOeval(self._coco_api, d3_res, 'bbox') + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + + aps = cocoEval.eval['precision'][:, :, :, 0, -1] + category_ids = self._coco_api.getCatIds() + category_names = [ + cat['name'] for cat in self._coco_api.loadCats(category_ids) + ] + + aps_lens = defaultdict(list) + counter_lens = defaultdict(int) + for i in range(len(category_names)): + ap = aps[:, :, i] + ap_value = ap[ap > -1].mean() + if not np.isnan(ap_value): + len_ref = len(category_names[i].split(' ')) + aps_lens[len_ref].append(ap_value) + counter_lens[len_ref] += 1 + + ap_sum_short = sum([sum(aps_lens[i]) for i in range(0, 4)]) + ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)]) + ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)]) + ap_sum_very_long = sum([ + sum(aps_lens[i]) for i in range(10, + max(counter_lens.keys()) + 1) + ]) + c_sum_short = sum([counter_lens[i] for i in range(1, 4)]) + c_sum_mid = sum([counter_lens[i] for i in range(4, 7)]) + c_sum_long = sum([counter_lens[i] for i in range(7, 10)]) + c_sum_very_long = sum( + [counter_lens[i] for i in range(10, + max(counter_lens.keys()) + 1)]) + map_short = ap_sum_short / c_sum_short + map_mid = ap_sum_mid / c_sum_mid + map_long = ap_sum_long / c_sum_long + map_very_long = ap_sum_very_long / c_sum_very_long + + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'] + + eval_results = {} + for metric_item in metric_items: + key = f'{metric_item}' + val = cocoEval.stats[coco_metric_names[metric_item]] + eval_results[key] = float(f'{round(val, 3)}') + + ap = cocoEval.stats[:6] + logger.info(f'mAP_copypaste: {ap[0]:.3f} ' + f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + + logger.info(f'mAP over reference length: short - {map_short:.4f}, ' + f'mid - {map_mid:.4f}, long - {map_long:.4f}, ' + f'very long - {map_very_long:.4f}') + eval_results['mAP_short'] = float(f'{round(map_short, 3)}') + eval_results['mAP_mid'] = float(f'{round(map_mid, 3)}') + eval_results['mAP_long'] = float(f'{round(map_long, 3)}') + eval_results['mAP_very_long'] = float(f'{round(map_very_long, 3)}') + return eval_results diff --git a/mmdet/evaluation/metrics/dump_odvg_results.py b/mmdet/evaluation/metrics/dump_odvg_results.py new file mode 100644 index 00000000000..8bba75a2d73 --- /dev/null +++ b/mmdet/evaluation/metrics/dump_odvg_results.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Any, Optional, Sequence + +from mmcv.ops import batched_nms +from mmengine.evaluator import BaseMetric +from mmengine.logging import print_log + +from mmdet.registry import METRICS + +try: + import jsonlines +except ImportError: + jsonlines = None + + +@METRICS.register_module() +class DumpODVGResults(BaseMetric): + default_prefix: Optional[str] = 'pl_odvg' + + def __init__(self, + outfile_path, + img_prefix: str, + score_thr: float = 0.1, + collect_device: str = 'cpu', + nms_thr: float = 0.5, + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.outfile_path = outfile_path + self.score_thr = score_thr + self.img_prefix = img_prefix + self.nms_thr = nms_thr + + if jsonlines is None: + raise ImportError('Please run "pip install jsonlines" to install ' + 'this package.') + + def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = {} + + filename = data_sample['img_path'] + filename = filename.replace(self.img_prefix, '') + if filename.startswith('/'): + filename = filename[1:] + result['filename'] = filename + + height = data_sample['ori_shape'][0] + width = data_sample['ori_shape'][1] + result['height'] = height + result['width'] = width + + caption = data_sample['text'] + result['grounding'] = {} + result['grounding']['caption'] = caption + + pred_instances = data_sample['pred_instances'] + + bboxes = pred_instances['bboxes'].cpu() + scores = pred_instances['scores'].cpu() + labels = pred_instances['labels'].cpu() + + bboxes = bboxes[scores > self.score_thr] + labels = labels[scores > self.score_thr] + scores = scores[scores > self.score_thr] + + tokens_positive = data_sample['tokens_positive'] + + region_list = [] + for label, positive in enumerate(tokens_positive): + pharse = [caption[pos[0]:pos[1]] for pos in positive] + + _bboxes = bboxes[labels == label] + _scores = scores[labels == label] + det_bboxes, _ = batched_nms( + _bboxes, + _scores, + None, + dict(type='nms', iou_threshold=self.nms_thr), + class_agnostic=True) + _scores = det_bboxes[:, -1].numpy().tolist() + _bboxes = det_bboxes[:, :-1].numpy().tolist() + + round_bboxes = [] + for bbox in _bboxes: + round_bboxes.append([round(b, 2) for b in bbox]) + _scores = [[round(s, 2) for s in _scores]] + region = { + 'phrase': pharse, + 'bbox': round_bboxes, + 'score': _scores, + 'tokens_positive': positive + } + region_list.append(region) + result['grounding']['regions'] = region_list + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + with jsonlines.open(self.outfile_path, mode='w') as writer: + writer.write_all(results) + print_log( + f'Results has been saved to {self.outfile_path}.', + logger='current') + return {} diff --git a/mmdet/evaluation/metrics/flickr30k_metric.py b/mmdet/evaluation/metrics/flickr30k_metric.py new file mode 100644 index 00000000000..2d2b1e423a1 --- /dev/null +++ b/mmdet/evaluation/metrics/flickr30k_metric.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. 
All rights reserved +from collections import defaultdict +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +class RecallTracker: + """Utility class to track recall@k for various k, split by categories.""" + + def __init__(self, topk: Sequence[int]): + """ + Parameters: + - topk : tuple of ints corresponding to the recalls being + tracked (eg, recall@1, recall@10, ...) + """ + + self.total_byk_bycat: Dict[int, Dict[str, int]] = { + k: defaultdict(int) + for k in topk + } + self.positives_byk_bycat: Dict[int, Dict[str, int]] = { + k: defaultdict(int) + for k in topk + } + + def add_positive(self, k: int, category: str): + """Log a positive hit @k for given category.""" + if k not in self.total_byk_bycat: + raise RuntimeError(f'{k} is not a valid recall threshold') + self.total_byk_bycat[k][category] += 1 + self.positives_byk_bycat[k][category] += 1 + + def add_negative(self, k: int, category: str): + """Log a negative hit @k for given category.""" + if k not in self.total_byk_bycat: + raise RuntimeError(f'{k} is not a valid recall threshold') + self.total_byk_bycat[k][category] += 1 + + def report(self) -> Dict[str, Dict[str, float]]: + """Return a condensed report of the results as a dict of dict. + + report[k][cat] is the recall@k for the given category + """ + report: Dict[str, Dict[str, float]] = {} + for k in self.total_byk_bycat: + assert k in self.positives_byk_bycat + report[str(k)] = { + cat: + self.positives_byk_bycat[k][cat] / self.total_byk_bycat[k][cat] + for cat in self.total_byk_bycat[k] + } + return report + + +@METRICS.register_module() +class Flickr30kMetric(BaseMetric): + """Phrase Grounding Metric.""" + + def __init__( + self, + topk: Sequence[int] = (1, 5, 10, -1), + iou_thrs: float = 0.5, + merge_boxes: bool = False, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + ) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + + self.iou_thrs = iou_thrs + self.topk = topk + self.merge = merge_boxes + + def merge_boxes(self, boxes: List[List[int]]) -> List[List[int]]: + """Return the boxes corresponding to the smallest enclosing box + containing all the provided boxes The boxes are expected in [x1, y1, + x2, y2] format.""" + if len(boxes) == 1: + return boxes + + np_boxes = np.asarray(boxes) + + return [[ + np.boxes[:, 0].min(), np_boxes[:, 1].min(), np_boxes[:, 2].max(), + np_boxes[:, 3].max() + ]] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + pred = data_sample['pred_instances'] + gt = data_sample['gt_instances']['bboxes'] + gt_label = data_sample['phrase_ids'] + phrases = data_sample['phrases'] + assert len(gt) == len(gt_label) + + self.results.append((pred, gt, gt_label, phrases)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + Returns: + Dict[str, float]: The computed metrics. 
The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + pred_list, gt_list, gt_label_list, phrase_list = zip(*results) + + recall_tracker = RecallTracker(self.topk) + + for pred, gt_boxes, gt_labels, phrases in zip(pred_list, gt_list, + gt_label_list, + phrase_list): + pred_boxes = pred['bboxes'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + for i, phrase in enumerate(phrases): + cur_index = pred_labels == i + cur_boxes = pred_boxes[cur_index] + tar_index = [ + index for index, value in enumerate(gt_labels) + if value == i + ] + tar_boxes = gt_boxes[tar_index] + if self.merge: + tar_boxes = self.merge_boxes(tar_boxes) + if len(cur_boxes) == 0: + cur_boxes = [[0., 0., 0., 0.]] + ious = bbox_overlaps( + np.asarray(cur_boxes), np.asarray(tar_boxes)) + for k in self.topk: + maxi = 0 + if k == -1: + maxi = ious.max() + else: + assert k > 0 + maxi = ious[:k].max() + if maxi >= self.iou_thrs: + recall_tracker.add_positive(k, 'all') + # TODO: do not support class-wise evaluation yet + # for phrase_type in phrase['phrase_type']: + # recall_tracker.add_positive(k, phrase_type) + else: + recall_tracker.add_negative(k, 'all') + # for phrase_type in phrase['phrase_type']: + # recall_tracker.add_negative(k, phrase_type) + + self.results = recall_tracker.report() + + logger.info(self.results) + + return self.results diff --git a/mmdet/evaluation/metrics/grefcoco_metric.py b/mmdet/evaluation/metrics/grefcoco_metric.py new file mode 100644 index 00000000000..55cc638c5e4 --- /dev/null +++ b/mmdet/evaluation/metrics/grefcoco_metric.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +# refer from https://github.com/henghuiding/gRefCOCO/blob/main/mdetr/datasets/refexp.py # noqa +@METRICS.register_module() +class gRefCOCOMetric(BaseMetric): + default_prefix: Optional[str] = 'grefcoco' + + def __init__(self, + ann_file: Optional[str] = None, + metric: str = 'bbox', + iou_thrs: float = 0.5, + thresh_score: float = 0.7, + thresh_f1: float = 1.0, + **kwargs) -> None: + super().__init__(**kwargs) + self.metric = metric + self.iou_thrs = iou_thrs + self.thresh_score = thresh_score + self.thresh_f1 = thresh_f1 + + with get_local_path(ann_file) as local_path: + self.coco = COCO(local_path) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu() + result['scores'] = pred['scores'].cpu() + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + logger: MMLogger = MMLogger.get_current_instance() + + correct_image = 0 + num_image = 0 + nt = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0} + + for result in results: + img_id = result['img_id'] + TP = 0 + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + target = self.coco.loadAnns(ann_ids[0]) + + converted_bbox_all = [] + no_target_flag = False + for one_target in target: + if one_target['category_id'] == -1: + no_target_flag = True + target_bbox = one_target['bbox'] + converted_bbox = [ + 
target_bbox[0], + target_bbox[1], + target_bbox[2] + target_bbox[0], + target_bbox[3] + target_bbox[1], + ] + converted_bbox_all.append( + np.array(converted_bbox).reshape(-1, 4)) + gt_bbox_all = np.concatenate(converted_bbox_all, axis=0) + + idx = result['scores'] >= self.thresh_score + filtered_boxes = result['bboxes'][idx] + + iou = bbox_overlaps(filtered_boxes.numpy(), gt_bbox_all) + iou = torch.from_numpy(iou) + + num_prediction = filtered_boxes.shape[0] + num_gt = gt_bbox_all.shape[0] + if no_target_flag: + if num_prediction >= 1: + nt['FN'] += 1 + else: + nt['TP'] += 1 + if num_prediction >= 1: + f_1 = 0. + else: + f_1 = 1.0 + else: + if num_prediction >= 1: + nt['TN'] += 1 + else: + nt['FP'] += 1 + for i in range(min(num_prediction, num_gt)): + top_value, top_index = torch.topk(iou.flatten(0, 1), 1) + if top_value < self.iou_thrs: + break + else: + top_index_x = top_index // num_gt + top_index_y = top_index % num_gt + TP += 1 + iou[top_index_x[0], :] = 0.0 + iou[:, top_index_y[0]] = 0.0 + FP = num_prediction - TP + FN = num_gt - TP + f_1 = 2 * TP / (2 * TP + FP + FN) + + if f_1 >= self.thresh_f1: + correct_image += 1 + num_image += 1 + + score = correct_image / max(num_image, 1) + results = { + 'F1_score': score, + 'T_acc': nt['TN'] / (nt['TN'] + nt['FP']), + 'N_acc': nt['TP'] / (nt['TP'] + nt['FN']) + } + logger.info(results) + return results diff --git a/mmdet/evaluation/metrics/refexp_metric.py b/mmdet/evaluation/metrics/refexp_metric.py new file mode 100644 index 00000000000..8bcdf1629b9 --- /dev/null +++ b/mmdet/evaluation/metrics/refexp_metric.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +@METRICS.register_module() +class RefExpMetric(BaseMetric): + default_prefix: Optional[str] = 'refexp' + + def __init__(self, + ann_file: Optional[str] = None, + metric: str = 'bbox', + topk=(1, 5, 10), + iou_thrs: float = 0.5, + **kwargs) -> None: + super().__init__(**kwargs) + self.metric = metric + self.topk = topk + self.iou_thrs = iou_thrs + + with get_local_path(ann_file) as local_path: + self.coco = COCO(local_path) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + logger: MMLogger = MMLogger.get_current_instance() + + dataset2score = { + 'refcoco': {k: 0.0 + for k in self.topk}, + 'refcoco+': {k: 0.0 + for k in self.topk}, + 'refcocog': {k: 0.0 + for k in self.topk}, + } + dataset2count = {'refcoco': 0.0, 'refcoco+': 0.0, 'refcocog': 0.0} + + for result in results: + img_id = result['img_id'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + assert len(ann_ids) == 1 + img_info = self.coco.loadImgs(img_id)[0] + target = self.coco.loadAnns(ann_ids[0]) + + target_bbox = target[0]['bbox'] + converted_bbox = [ + target_bbox[0], + target_bbox[1], + target_bbox[2] + target_bbox[0], + target_bbox[3] + target_bbox[1], + ] + iou = bbox_overlaps(result['bboxes'], + 
np.array(converted_bbox).reshape(-1, 4)) + for k in self.topk: + if max(iou[:k]) >= self.iou_thrs: + dataset2score[img_info['dataset_name']][k] += 1.0 + dataset2count[img_info['dataset_name']] += 1.0 + + for key, value in dataset2score.items(): + for k in self.topk: + try: + value[k] /= dataset2count[key] + except Exception as e: + print(e) + + results = {} + mean_precision = 0.0 + for key, value in dataset2score.items(): + results[key] = sorted([v for k, v in value.items()]) + mean_precision += sum(results[key]) + logger.info( + f' Dataset: {key} - Precision @ 1, 5, 10: {results[key]}') + + # `mean_precision` key is used for saving the best checkpoint + out_results = {'mean_precision': mean_precision / 9.0} + + for i, k in enumerate(self.topk): + out_results[f'refcoco_precision@{k}'] = results['refcoco'][i] + for i, k in enumerate(self.topk): + out_results[f'refcoco+_precision@{k}'] = results['refcoco+'][i] + for i, k in enumerate(self.topk): + out_results[f'refcocog_precision@{k}'] = results['refcocog'][i] + return out_results diff --git a/mmdet/models/dense_heads/grounding_dino_head.py b/mmdet/models/dense_heads/grounding_dino_head.py index 3aced626555..8088322546f 100644 --- a/mmdet/models/dense_heads/grounding_dino_head.py +++ b/mmdet/models/dense_heads/grounding_dino_head.py @@ -417,14 +417,21 @@ def _predict_by_feat_single(self, max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) img_shape = img_meta['img_shape'] - cls_score = convert_grounding_to_cls_scores( - logits=cls_score.sigmoid()[None], - positive_maps=[token_positive_maps])[0] - scores, indexes = cls_score.view(-1).topk(max_per_img) - num_classes = cls_score.shape[-1] - det_labels = indexes % num_classes - bbox_index = indexes // num_classes - bbox_pred = bbox_pred[bbox_index] + if token_positive_maps is not None: + cls_score = convert_grounding_to_cls_scores( + logits=cls_score.sigmoid()[None], + positive_maps=[token_positive_maps])[0] + scores, indexes = cls_score.view(-1).topk(max_per_img) + num_classes = cls_score.shape[-1] + det_labels = indexes % num_classes + bbox_index = indexes // num_classes + bbox_pred = bbox_pred[bbox_index] + else: + cls_score = cls_score.sigmoid() + scores, _ = cls_score.max(-1) + scores, indexes = scores.topk(max_per_img) + bbox_pred = bbox_pred[indexes] + det_labels = scores.new_zeros(scores.shape, dtype=torch.long) det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] diff --git a/mmdet/models/detectors/glip.py b/mmdet/models/detectors/glip.py index 4011e73d09f..e9729cee8af 100644 --- a/mmdet/models/detectors/glip.py +++ b/mmdet/models/detectors/glip.py @@ -79,6 +79,7 @@ def run_ner(caption: str) -> Tuple[list, list]: noun_phrases = find_noun_phrases(caption) noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases] noun_phrases = [phrase for phrase in noun_phrases if phrase != ''] + print('noun_phrases:', noun_phrases) relevant_phrases = noun_phrases labels = noun_phrases @@ -315,8 +316,31 @@ def get_tokens_positive_and_prompts( self, original_caption: Union[str, list, tuple], custom_entities: bool = False, - enhanced_text_prompt: Optional[ConfigType] = None + enhanced_text_prompt: Optional[ConfigType] = None, + tokens_positive: Optional[list] = None, ) -> Tuple[dict, str, Tensor, list]: + if tokens_positive is not None: + if tokens_positive == -1: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + return None, original_caption, None, original_caption + else: + if not 
original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + tokenized = self.language_model.tokenizer([original_caption], + return_tensors='pt') + positive_map_label_to_token, positive_map = \ + self.get_positive_map(tokenized, tokens_positive) + + entities = [] + for token_positive in tokens_positive: + instance_entities = [] + for t in token_positive: + instance_entities.append(original_caption[t[0]:t[1]]) + entities.append(' / '.join(instance_entities)) + return positive_map_label_to_token, original_caption, \ + positive_map, entities + chunked_size = self.test_cfg.get('chunked_size', -1) if not self.training and chunked_size > 0: assert isinstance(original_caption, @@ -469,12 +493,14 @@ def predict(self, """ text_prompts = [] enhanced_text_prompts = [] + tokens_positives = [] for data_samples in batch_data_samples: text_prompts.append(data_samples.text) if 'caption_prompt' in data_samples: enhanced_text_prompts.append(data_samples.caption_prompt) else: enhanced_text_prompts.append(None) + tokens_positives.append(data_samples.get('tokens_positive', None)) if 'custom_entities' in batch_data_samples[0]: # Assuming that the `custom_entities` flag @@ -488,15 +514,17 @@ def predict(self, # so there is no need to calculate them multiple times. _positive_maps_and_prompts = [ self.get_tokens_positive_and_prompts( - text_prompts[0], custom_entities, enhanced_text_prompts[0]) + text_prompts[0], custom_entities, enhanced_text_prompts[0], + tokens_positives[0]) ] * len(batch_inputs) else: _positive_maps_and_prompts = [ self.get_tokens_positive_and_prompts(text_prompt, custom_entities, - enhanced_text_prompt) - for text_prompt, enhanced_text_prompt in zip( - text_prompts, enhanced_text_prompts) + enhanced_text_prompt, + tokens_positive) + for text_prompt, enhanced_text_prompt, tokens_positive in zip( + text_prompts, enhanced_text_prompts, tokens_positives) ] token_positive_maps, text_prompts, _, entities = zip( diff --git a/mmdet/models/detectors/grounding_dino.py b/mmdet/models/detectors/grounding_dino.py index cc6cccedf29..4ec9d14e634 100644 --- a/mmdet/models/detectors/grounding_dino.py +++ b/mmdet/models/detectors/grounding_dino.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy import re import warnings from typing import Dict, Optional, Tuple, Union @@ -25,6 +26,20 @@ def clean_label_name(name: str) -> str: return name +def chunks(lst: list, n: int) -> list: + """Yield successive n-sized chunks from lst.""" + all_ = [] + for i in range(0, len(lst), n): + data_index = lst[i:i + n] + all_.append(data_index) + counter = 0 + for i in all_: + counter += len(i) + assert (counter == len(lst)) + + return all_ + + @MODELS.register_module() class GroundingDINO(DINO): """Implementation of `Grounding DINO: Marrying DINO with Grounded Pre- @@ -175,7 +190,8 @@ def get_tokens_positive_and_prompts( self, original_caption: Union[str, list, tuple], custom_entities: bool = False, - enhanced_text_prompt: Optional[ConfigType] = None + enhanced_text_prompt: Optional[ConfigType] = None, + tokens_positive: Optional[list] = None, ) -> Tuple[dict, str, Tensor, list]: """Get the tokens positive and prompts for the caption. @@ -190,14 +206,94 @@ def get_tokens_positive_and_prompts( id, which is numbered from 1, to its positive token id. The str represents the prompts. 
""" - tokenized, caption_string, tokens_positive, entities = \ - self.get_tokens_and_prompts( - original_caption, custom_entities, enhanced_text_prompt) - positive_map_label_to_token, positive_map = self.get_positive_map( - tokenized, tokens_positive) + if tokens_positive is not None: + if tokens_positive == -1: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + return None, original_caption, None, original_caption + else: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + tokenized = self.language_model.tokenizer( + [original_caption], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + positive_map_label_to_token, positive_map = \ + self.get_positive_map(tokenized, tokens_positive) + + entities = [] + for token_positive in tokens_positive: + instance_entities = [] + for t in token_positive: + instance_entities.append(original_caption[t[0]:t[1]]) + entities.append(' / '.join(instance_entities)) + return positive_map_label_to_token, original_caption, \ + positive_map, entities + + chunked_size = self.test_cfg.get('chunked_size', -1) + if not self.training and chunked_size > 0: + assert isinstance(original_caption, + (list, tuple)) or custom_entities is True + all_output = self.get_tokens_positive_and_prompts_chunked( + original_caption, enhanced_text_prompt) + positive_map_label_to_token, \ + caption_string, \ + positive_map, \ + entities = all_output + else: + tokenized, caption_string, tokens_positive, entities = \ + self.get_tokens_and_prompts( + original_caption, custom_entities, enhanced_text_prompt) + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) return positive_map_label_to_token, caption_string, \ positive_map, entities + def get_tokens_positive_and_prompts_chunked( + self, + original_caption: Union[list, tuple], + enhanced_text_prompts: Optional[ConfigType] = None): + chunked_size = self.test_cfg.get('chunked_size', -1) + original_caption = [clean_label_name(i) for i in original_caption] + + original_caption_chunked = chunks(original_caption, chunked_size) + ids_chunked = chunks( + list(range(1, + len(original_caption) + 1)), chunked_size) + + positive_map_label_to_token_chunked = [] + caption_string_chunked = [] + positive_map_chunked = [] + entities_chunked = [] + + for i in range(len(ids_chunked)): + if enhanced_text_prompts is not None: + caption_string, tokens_positive = self.to_enhance_text_prompts( + original_caption_chunked[i], enhanced_text_prompts) + else: + caption_string, tokens_positive = self.to_plain_text_prompts( + original_caption_chunked[i]) + tokenized = self.language_model.tokenizer([caption_string], + return_tensors='pt') + if tokenized.input_ids.shape[1] > self.language_model.max_tokens: + warnings.warn('Inputting a text that is too long will result ' + 'in poor prediction performance. 
' + 'Please reduce the --chunked-size.') + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + + caption_string_chunked.append(caption_string) + positive_map_label_to_token_chunked.append( + positive_map_label_to_token) + positive_map_chunked.append(positive_map) + entities_chunked.append(original_caption_chunked[i]) + + return positive_map_label_to_token_chunked, \ + caption_string_chunked, \ + positive_map_chunked, \ + entities_chunked + def forward_transformer( self, img_feats: Tuple[Tensor], @@ -316,7 +412,6 @@ def pre_decoder( def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList) -> Union[dict, list]: - # TODO: Only open vocabulary tasks are supported for training now. text_prompts = [ data_samples.text for data_samples in batch_data_samples ] @@ -326,34 +421,55 @@ def loss(self, batch_inputs: Tensor, for data_samples in batch_data_samples ] - new_text_prompts = [] - positive_maps = [] - if len(set(text_prompts)) == 1: - # All the text prompts are the same, - # so there is no need to calculate them multiple times. - tokenized, caption_string, tokens_positive, _ = \ - self.get_tokens_and_prompts( - text_prompts[0], True) - new_text_prompts = [caption_string] * len(batch_inputs) - for gt_label in gt_labels: + if 'tokens_positive' in batch_data_samples[0]: + tokens_positive = [ + data_samples.tokens_positive + for data_samples in batch_data_samples + ] + positive_maps = [] + for token_positive, text_prompt, gt_label in zip( + tokens_positive, text_prompts, gt_labels): + tokenized = self.language_model.tokenizer( + [text_prompt], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') new_tokens_positive = [ - tokens_positive[label] for label in gt_label + token_positive[label.item()] for label in gt_label ] _, positive_map = self.get_positive_map( tokenized, new_tokens_positive) positive_maps.append(positive_map) + new_text_prompts = text_prompts else: - for text_prompt, gt_label in zip(text_prompts, gt_labels): + new_text_prompts = [] + positive_maps = [] + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
tokenized, caption_string, tokens_positive, _ = \ self.get_tokens_and_prompts( - text_prompt, True) - new_tokens_positive = [ - tokens_positive[label] for label in gt_label - ] - _, positive_map = self.get_positive_map( - tokenized, new_tokens_positive) - positive_maps.append(positive_map) - new_text_prompts.append(caption_string) + text_prompts[0], True) + new_text_prompts = [caption_string] * len(batch_inputs) + for gt_label in gt_labels: + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + else: + for text_prompt, gt_label in zip(text_prompts, gt_labels): + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompt, True) + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts.append(caption_string) text_dict = self.language_model(new_text_prompts) if self.text_feat_map is not None: @@ -379,12 +495,14 @@ def loss(self, batch_inputs: Tensor, def predict(self, batch_inputs, batch_data_samples, rescale: bool = True): text_prompts = [] enhanced_text_prompts = [] + tokens_positives = [] for data_samples in batch_data_samples: text_prompts.append(data_samples.text) if 'caption_prompt' in data_samples: enhanced_text_prompts.append(data_samples.caption_prompt) else: enhanced_text_prompts.append(None) + tokens_positives.append(data_samples.get('tokens_positive', None)) if 'custom_entities' in batch_data_samples[0]: # Assuming that the `custom_entities` flag @@ -397,41 +515,88 @@ def predict(self, batch_inputs, batch_data_samples, rescale: bool = True): # so there is no need to calculate them multiple times. 
_positive_maps_and_prompts = [ self.get_tokens_positive_and_prompts( - text_prompts[0], custom_entities, enhanced_text_prompts[0]) + text_prompts[0], custom_entities, enhanced_text_prompts[0], + tokens_positives[0]) ] * len(batch_inputs) else: _positive_maps_and_prompts = [ self.get_tokens_positive_and_prompts(text_prompt, custom_entities, - enhanced_text_prompt) - for text_prompt, enhanced_text_prompt in zip( - text_prompts, enhanced_text_prompts) + enhanced_text_prompt, + tokens_positive) + for text_prompt, enhanced_text_prompt, tokens_positive in zip( + text_prompts, enhanced_text_prompts, tokens_positives) ] token_positive_maps, text_prompts, _, entities = zip( *_positive_maps_and_prompts) - # extract text feats - text_dict = self.language_model(list(text_prompts)) - # text feature map layer - if self.text_feat_map is not None: - text_dict['embedded'] = self.text_feat_map(text_dict['embedded']) - - for i, data_samples in enumerate(batch_data_samples): - data_samples.token_positive_map = token_positive_maps[i] # image feature extraction visual_feats = self.extract_feat(batch_inputs) - head_inputs_dict = self.forward_transformer(visual_feats, text_dict, - batch_data_samples) - results_list = self.bbox_head.predict( - **head_inputs_dict, - rescale=rescale, - batch_data_samples=batch_data_samples) - for data_sample, pred_instances, entity in zip(batch_data_samples, - results_list, entities): + if isinstance(text_prompts[0], list): + # chunked text prompts, only bs=1 is supported + assert len(batch_inputs) == 1 + count = 0 + results_list = [] + + entities = [[item for lst in entities[0] for item in lst]] + + for b in range(len(text_prompts[0])): + text_prompts_once = [text_prompts[0][b]] + token_positive_maps_once = token_positive_maps[0][b] + text_dict = self.language_model(text_prompts_once) + # text feature map layer + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map( + text_dict['embedded']) + + batch_data_samples[ + 0].token_positive_map = token_positive_maps_once + + head_inputs_dict = self.forward_transformer( + copy.deepcopy(visual_feats), text_dict, batch_data_samples) + pred_instances = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples)[0] + + if len(pred_instances) > 0: + pred_instances.labels += count + count += len(token_positive_maps_once) + results_list.append(pred_instances) + results_list = [results_list[0].cat(results_list)] + is_rec_tasks = [False] * len(results_list) + else: + # extract text feats + text_dict = self.language_model(list(text_prompts)) + # text feature map layer + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map( + text_dict['embedded']) + + is_rec_tasks = [] + for i, data_samples in enumerate(batch_data_samples): + if token_positive_maps[i] is not None: + is_rec_tasks.append(False) + else: + is_rec_tasks.append(True) + data_samples.token_positive_map = token_positive_maps[i] + + head_inputs_dict = self.forward_transformer( + visual_feats, text_dict, batch_data_samples) + results_list = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples) + + for data_sample, pred_instances, entity, is_rec_task in zip( + batch_data_samples, results_list, entities, is_rec_tasks): if len(pred_instances) > 0: label_names = [] for labels in pred_instances.labels: + if is_rec_task: + label_names.append(entity) + continue if labels >= len(entity): warnings.warn( 'The unexpected output indicates an issue with ' 
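The `tokens_positive` branches added above recover entity strings by slicing the raw caption with character offsets, while `tokens_positive=-1` switches to a referring-expression mode with no positive map at all. A tiny standalone illustration of the slicing, using a made-up caption and offsets:

```python
caption = 'a cat sitting on a wooden chair.'

# One list of [start, end) character spans per instance; an instance may own
# several spans, which are joined with ' / ' as in the code above.
tokens_positive = [[[2, 5]], [[19, 31]]]

entities = []
for token_positive in tokens_positive:
    instance_entities = [caption[start:end] for start, end in token_positive]
    entities.append(' / '.join(instance_entities))

print(entities)  # ['cat', 'wooden chair']
```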
diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt index 03fdb17777e..20924eb3ee1 100644 --- a/requirements/multimodal.txt +++ b/requirements/multimodal.txt @@ -1,4 +1,5 @@ fairscale +jsonlines nltk pycocoevalcap transformers diff --git a/requirements/optional.txt b/requirements/optional.txt index 54e5dd647f4..31bdde50bea 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1,4 +1,5 @@ cityscapesscripts +emoji fairscale imagecorruptions scikit-learn diff --git a/setup.cfg b/setup.cfg index a3ff3fa46d2..7ecd4b98a70 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,7 +18,7 @@ SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = true [codespell] skip = *.ipynb,configs/v3det/category_name_13204_v3det_2023_v1.txt quiet-level = 3 -ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood,ba,warmup,nam,DOTA,dota,conveyer,singed,comittee +ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids,TOOD,tood,ba,warmup,nam,DOTA,dota,conveyer,singed,comittee,extention,moniter,pres, [flake8] per-file-ignores = mmdet/configs/*: F401,F403,F405 diff --git a/tools/analysis_tools/browse_grounding_dataset.py b/tools/analysis_tools/browse_grounding_dataset.py new file mode 100644 index 00000000000..43261956faa --- /dev/null +++ b/tools/analysis_tools/browse_grounding_dataset.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +import numpy as np +from mmcv.image import imwrite +from mmengine.config import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmdet.registry import DATASETS, VISUALIZERS +from mmdet.structures.bbox import BaseBoxes + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + '-o', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument('--show-num', '-n', type=int, default=30) + parser.add_argument('--shuffle', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=0, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def draw_all_character(visualizer, characters, w): + start_index = 2 + y_index = 5 + for char in characters: + if isinstance(char, str): + visualizer.draw_texts( + str(char), + positions=np.array([start_index, y_index]), + colors=(0, 0, 0), + font_families='monospace') + start_index += len(char) * 8 + else: + visualizer.draw_texts( + str(char[0]), + positions=np.array([start_index, y_index]), + colors=char[1], + font_families='monospace') + start_index += len(char[0]) * 8 + + if start_index > w - 10: + start_index = 2 + y_index += 15 + + drawn_text = visualizer.get_image() + return drawn_text + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + assert args.show_num > 0 + + # register all modules in mmdet into the registries + init_default_scope(cfg.get('default_scope', 'mmdet')) + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + dataset_index = list(range(len(dataset))) + if args.shuffle: + import random + random.shuffle(dataset_index) + + progress_bar = ProgressBar(len(dataset)) + for i in dataset_index[:args.show_num]: + item = dataset[i] + img = item['inputs'].permute(1, 2, 0).numpy() + data_sample = item['data_samples'].numpy() + gt_instances = data_sample.gt_instances + tokens_positive = data_sample.tokens_positive + + gt_labels = gt_instances.labels + + base_name = osp.basename(item['data_samples'].img_path) + name, extension = osp.splitext(base_name) + + out_file = osp.join(args.output_dir, name + '_' + str(i) + + extension) if args.output_dir is not None else None + + img = img[..., [2, 1, 0]] # bgr to rgb + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + + print(data_sample.text) + + dataset_mode = data_sample.dataset_mode + if dataset_mode == 'VG': + max_label = int(max(gt_labels) if len(gt_labels) > 0 else 0) + palette = np.random.randint(0, 256, size=(max_label + 1, 3)) + bbox_palette = [tuple(c) for c in palette] + # bbox_palette = get_palette('random', max_label + 1) + colors = [bbox_palette[label] for label in gt_labels] + + visualizer.set_image(img) + + for label, bbox, color in zip(gt_labels, gt_bboxes, colors): + visualizer.draw_bboxes( + bbox, edge_colors=color, face_colors=color, alpha=0.3) + visualizer.draw_bboxes(bbox, edge_colors=color, alpha=1) + + drawn_img = visualizer.get_image() + + new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255 + visualizer.set_image(new_image) + + gt_tokens_positive = [ + tokens_positive[label] for label in gt_labels + ] + split_by_character = [char for char in data_sample.text] + characters = [] + start_index = 0 + end_index = 0 + for w in split_by_character: + end_index += len(w) + is_find = False + for i, positive in enumerate(gt_tokens_positive): + for p in positive: + if start_index >= p[0] and end_index <= p[1]: + characters.append([w, colors[i]]) + is_find = True + break + if is_find: + break + if not is_find: + characters.append([w, (0, 0, 0)]) + start_index = end_index + + drawn_text = draw_all_character(visualizer, characters, + img.shape[1]) + drawn_img = np.concatenate((drawn_img, drawn_text), axis=0) + else: + gt_labels = gt_instances.labels + text 
= data_sample.text + label_names = [] + for label in gt_labels: + label_names.append(text[ + tokens_positive[label][0][0]:tokens_positive[label][0][1]]) + gt_instances.label_names = label_names + data_sample.gt_instances = gt_instances + + visualizer.add_datasample( + base_name, + img, + data_sample, + draw_pred=False, + show=False, + wait_time=0, + out_file=None) + drawn_img = visualizer.get_image() + + new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255 + visualizer.set_image(new_image) + + characters = [char for char in text] + drawn_text = draw_all_character(visualizer, characters, + img.shape[1]) + drawn_img = np.concatenate((drawn_img, drawn_text), axis=0) + + if not args.not_show: + visualizer.show( + drawn_img, win_name=base_name, wait_time=args.show_interval) + + if out_file is not None: + imwrite(drawn_img[..., ::-1], out_file) + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/browse_grounding_raw.py b/tools/analysis_tools/browse_grounding_raw.py new file mode 100644 index 00000000000..4fcf10a032c --- /dev/null +++ b/tools/analysis_tools/browse_grounding_raw.py @@ -0,0 +1,280 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import os.path as osp + +import cv2 +import numpy as np +from mmcv.image import imfrombytes, imwrite +from mmengine.fileio import get +from mmengine.structures import InstanceData +from mmengine.utils import mkdir_or_exist + +from mmdet.structures import DetDataSample +from mmdet.visualization import DetLocalVisualizer +from mmdet.visualization.palette import _get_adaptive_scales + +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('data_root') + parser.add_argument('ann_file') + parser.add_argument('img_prefix') + parser.add_argument('--label-map-file', '-m', default=None) + parser.add_argument( + '--output-dir', + '-o', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument('--show-num', '-n', type=int, default=30) + parser.add_argument('--shuffle', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=0, + help='the interval of show (s)') + args = parser.parse_args() + return args + + +def draw_all_character(visualizer, characters, w): + start_index = 2 + y_index = 5 + for char in characters: + if isinstance(char, str): + visualizer.draw_texts( + str(char), + positions=np.array([start_index, y_index]), + colors=(0, 0, 0), + font_families='monospace') + start_index += len(char) * 8 + else: + visualizer.draw_texts( + str(char[0]), + positions=np.array([start_index, y_index]), + colors=char[1], + font_families='monospace') + start_index += len(char[0]) * 8 + + if start_index > w - 10: + start_index = 2 + y_index += 15 + + drawn_text = visualizer.get_image() + return drawn_text + + +def main(): + args = parse_args() + assert args.show_num > 0 + + local_path = osp.join(args.data_root, args.ann_file) + with open(local_path, 'r') as f: + data_list = [json.loads(line) for line in f] + + dataset_index = list(range(len(data_list))) + if args.shuffle: + import random + random.shuffle(dataset_index) + + if args.label_map_file is not None: + 
label_map_file = osp.join(args.data_root, args.label_map_file) + with open(label_map_file, 'r') as file: + label_map = json.load(file) + + visualizer = DetLocalVisualizer() + + for i in dataset_index[:args.show_num]: + item = data_list[i] + + img_path = osp.join(args.data_root, args.img_prefix, item['filename']) + if backend_args is not None: + img_bytes = get(img_path, backend_args) + img = imfrombytes(img_bytes, flag='color') + else: + img = cv2.imread(img_path) + img = img[..., [2, 1, 0]] # bgr to rgb + + base_name, extension = osp.splitext(item['filename']) + + out_file = osp.join(args.output_dir, base_name + '_' + str(i) + + extension) if args.output_dir is not None else None + + if args.output_dir is not None: + mkdir_or_exist(args.output_dir) + + if 'detection' in item: + anno = item['detection'] + + instances = [obj for obj in anno['instances']] + bboxes = [obj['bbox'] for obj in instances] + bbox_labels = [int(obj['label']) for obj in instances] + label_names = [label_map[str(label)] for label in bbox_labels] + + data_sample = DetDataSample() + instances = InstanceData() + instances['bboxes'] = np.array(bboxes).reshape(-1, 4) + instances['labels'] = np.array(bbox_labels) + instances['label_names'] = label_names + data_sample.gt_instances = instances + + visualizer.add_datasample( + osp.basename(img_path), + img, + data_sample, + draw_pred=False, + show=not args.not_show, + wait_time=args.show_interval, + out_file=out_file) + elif 'grounding' in item: + anno = item['grounding'] + text = anno['caption'] + regions = anno['regions'] + + max_label = len(regions) if len(regions) > 0 else 0 + palette = np.random.randint(0, 256, size=(max_label + 1, 3)) + bbox_palette = [tuple(c) for c in palette] + # bbox_palette = get_palette('random', max_label + 1) + colors = [bbox_palette[label] for label in range(max_label)] + + visualizer.set_image(img) + + gt_tokens_positive = [] + for i, region in enumerate(regions): + bbox = region['bbox'] + bbox = np.array(bbox).reshape(-1, 4) + tokens_positive = region['tokens_positive'] + gt_tokens_positive.append(tokens_positive) + visualizer.draw_bboxes( + bbox, + edge_colors=colors[i], + face_colors=colors[i], + alpha=0.3) + visualizer.draw_bboxes(bbox, edge_colors=colors[i], alpha=1) + + if 'score' in region: + areas = (bbox[:, 3] - bbox[:, 1]) * ( + bbox[:, 2] - bbox[:, 0]) + scales = _get_adaptive_scales(areas) + score = region['score'][0] + score = [str(s) for s in score] + font_sizes = [ + int(13 * scales[i]) for i in range(len(scales)) + ] + visualizer.draw_texts( + score, + bbox[:, :2].astype(np.int32), + colors=(255, 255, 255), + font_sizes=font_sizes, + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }] * len(bbox)) + + drawn_img = visualizer.get_image() + new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255 + visualizer.set_image(new_image) + + split_by_character = [char for char in text] + characters = [] + start_index = 0 + end_index = 0 + for w in split_by_character: + end_index += len(w) + is_find = False + for i, positive in enumerate(gt_tokens_positive): + for p in positive: + if start_index >= p[0] and end_index <= p[1]: + characters.append([w, colors[i]]) + is_find = True + break + if is_find: + break + if not is_find: + characters.append([w, (0, 0, 0)]) + start_index = end_index + + drawn_text = draw_all_character(visualizer, characters, + img.shape[1]) + drawn_img = np.concatenate((drawn_img, drawn_text), axis=0) + + if not args.not_show: + visualizer.show( + drawn_img, + 
win_name=base_name, + wait_time=args.show_interval) + + if out_file is not None: + imwrite(drawn_img[..., ::-1], out_file) + + elif 'referring' in item: + referring = item['referring'] + + max_label = len(referring) if len(referring) > 0 else 0 + palette = np.random.randint(0, 256, size=(max_label + 1, 3)) + bbox_palette = [tuple(c) for c in palette] + # bbox_palette = get_palette('random', max_label + 1) + colors = [bbox_palette[label] for label in range(max_label)] + + visualizer.set_image(img) + phrases = [] + for i, ref in enumerate(referring): + bbox = ref['bbox'] + phrase = ref['phrase'] + phrases.append(' // '.join(phrase)) + bbox = np.array(bbox).reshape(-1, 4) + + visualizer.draw_bboxes( + bbox, + edge_colors=colors[i], + face_colors=colors[i], + alpha=0.3) + visualizer.draw_bboxes(bbox, edge_colors=colors[i], alpha=1) + drawn_img = visualizer.get_image() + + new_image = np.ones((100, img.shape[1], 3), dtype=np.uint8) * 255 + visualizer.set_image(new_image) + + start_index = 2 + y_index = 5 + + chunk_size = max(min(img.shape[1] - 400, 70), 50) + for i, p in enumerate(phrases): + chunk_p = [ + p[i:i + chunk_size] for i in range(0, len(p), chunk_size) + ] + for cp in chunk_p: + visualizer.draw_texts( + cp, + positions=np.array([start_index, y_index]), + colors=colors[i], + font_families='monospace') + y_index += 15 + + drawn_text = visualizer.get_image() + drawn_img = np.concatenate((drawn_img, drawn_text), axis=0) + + if not args.not_show: + visualizer.show( + drawn_img, + win_name=base_name, + wait_time=args.show_interval) + + if out_file is not None: + imwrite(drawn_img[..., ::-1], out_file) + + +if __name__ == '__main__': + main() diff --git a/tools/dataset_converters/coco2odvg.py b/tools/dataset_converters/coco2odvg.py new file mode 100644 index 00000000000..3cd2b044405 --- /dev/null +++ b/tools/dataset_converters/coco2odvg.py @@ -0,0 +1,344 @@ +import argparse +import json +import os.path + +import jsonlines +from pycocotools.coco import COCO +from tqdm import tqdm + +id_map = { + 0: 1, + 1: 2, + 2: 3, + 3: 4, + 4: 5, + 5: 6, + 6: 7, + 7: 8, + 8: 9, + 9: 10, + 10: 11, + 11: 13, + 12: 14, + 13: 15, + 14: 16, + 15: 17, + 16: 18, + 17: 19, + 18: 20, + 19: 21, + 20: 22, + 21: 23, + 22: 24, + 23: 25, + 24: 27, + 25: 28, + 26: 31, + 27: 32, + 28: 33, + 29: 34, + 30: 35, + 31: 36, + 32: 37, + 33: 38, + 34: 39, + 35: 40, + 36: 41, + 37: 42, + 38: 43, + 39: 44, + 40: 46, + 41: 47, + 42: 48, + 43: 49, + 44: 50, + 45: 51, + 46: 52, + 47: 53, + 48: 54, + 49: 55, + 50: 56, + 51: 57, + 52: 58, + 53: 59, + 54: 60, + 55: 61, + 56: 62, + 57: 63, + 58: 64, + 59: 65, + 60: 67, + 61: 70, + 62: 72, + 63: 73, + 64: 74, + 65: 75, + 66: 76, + 67: 77, + 68: 78, + 69: 79, + 70: 80, + 71: 81, + 72: 82, + 73: 84, + 74: 85, + 75: 86, + 76: 87, + 77: 88, + 78: 89, + 79: 90 +} +key_list_coco = list(id_map.keys()) +val_list_coco = list(id_map.values()) +key_list_o365 = [i for i in range(365)] +val_list_o365 = [i for i in range(1, 366)] +key_list_v3det = [i for i in range(13204)] +val_list_v3det = [i for i in range(1, 13205)] + + +def dump_coco_label_map(args): + ori_map = { + '1': 'person', + '2': 'bicycle', + '3': 'car', + '4': 'motorcycle', + '5': 'airplane', + '6': 'bus', + '7': 'train', + '8': 'truck', + '9': 'boat', + '10': 'traffic light', + '11': 'fire hydrant', + '13': 'stop sign', + '14': 'parking meter', + '15': 'bench', + '16': 'bird', + '17': 'cat', + '18': 'dog', + '19': 'horse', + '20': 'sheep', + '21': 'cow', + '22': 'elephant', + '23': 'bear', + '24': 'zebra', + '25': 'giraffe', + '27': 
'backpack', + '28': 'umbrella', + '31': 'handbag', + '32': 'tie', + '33': 'suitcase', + '34': 'frisbee', + '35': 'skis', + '36': 'snowboard', + '37': 'sports ball', + '38': 'kite', + '39': 'baseball bat', + '40': 'baseball glove', + '41': 'skateboard', + '42': 'surfboard', + '43': 'tennis racket', + '44': 'bottle', + '46': 'wine glass', + '47': 'cup', + '48': 'fork', + '49': 'knife', + '50': 'spoon', + '51': 'bowl', + '52': 'banana', + '53': 'apple', + '54': 'sandwich', + '55': 'orange', + '56': 'broccoli', + '57': 'carrot', + '58': 'hot dog', + '59': 'pizza', + '60': 'donut', + '61': 'cake', + '62': 'chair', + '63': 'couch', + '64': 'potted plant', + '65': 'bed', + '67': 'dining table', + '70': 'toilet', + '72': 'tv', + '73': 'laptop', + '74': 'mouse', + '75': 'remote', + '76': 'keyboard', + '77': 'cell phone', + '78': 'microwave', + '79': 'oven', + '80': 'toaster', + '81': 'sink', + '82': 'refrigerator', + '84': 'book', + '85': 'clock', + '86': 'vase', + '87': 'scissors', + '88': 'teddy bear', + '89': 'hair drier', + '90': 'toothbrush' + } + new_map = {} + for key, value in ori_map.items(): + label = int(key) + ind = val_list_coco.index(label) + label_trans = key_list_coco[ind] + new_map[label_trans] = value + if args.output is None: + output = os.path.dirname(args.input) + '/coco2017_label_map.json' + else: + output = os.path.dirname(args.output) + '/coco2017_label_map.json' + with open(output, 'w') as f: + json.dump(new_map, f) + + +def dump_o365v1_label_map(args): + with open(args.input, 'r') as f: + j = json.load(f) + o_dict = {} + for category in j['categories']: + index = str(int(category['id']) - 1) + name = category['name'] + o_dict[index] = name + if args.output is None: + output = os.path.dirname(args.input) + '/o365v1_label_map.json' + else: + output = os.path.dirname(args.output) + '/o365v1_label_map.json' + with open(output, 'w') as f: + json.dump(o_dict, f) + + +def dump_o365v2_label_map(args): + with open(args.input, 'r') as f: + j = json.load(f) + o_dict = {} + for category in j['categories']: + index = str(int(category['id']) - 1) + name = category['name'] + o_dict[index] = name + if args.output is None: + output = os.path.dirname(args.input) + '/o365v2_label_map.json' + else: + output = os.path.dirname(args.output) + '/o365v2_label_map.json' + with open(output, 'w') as f: + json.dump(o_dict, f) + + +def dump_v3det_label_map(args): + with open(args.input, 'r') as f: + j = json.load(f) + o_dict = {} + for category in j['categories']: + index = str(int(category['id']) - 1) + name = category['name'] + o_dict[index] = name + if args.output is None: + output = os.path.dirname(args.input) + '/v3det_2023_v1_label_map.json' + else: + output = os.path.dirname(args.output) + '/v3det_2023_v1_label_map.json' + with open(output, 'w') as f: + json.dump(o_dict, f) + + +def coco2odvg(args): + coco = COCO(args.input) + cats = coco.loadCats(coco.getCatIds()) + nms = {cat['id']: cat['name'] for cat in cats} + metas = [] + if args.output is None: + out_path = args.input[:-5] + '_od.json' + else: + out_path = args.output + + if args.dataset == 'coco': + key_list = key_list_coco + val_list = val_list_coco + dump_coco_label_map(args) + elif args.dataset == 'o365v1': + key_list = key_list_o365 + val_list = val_list_o365 + dump_o365v1_label_map(args) + elif args.dataset == 'o365v2': + key_list = key_list_o365 + val_list = val_list_o365 + dump_o365v2_label_map(args) + elif args.dataset == 'v3det': + key_list = key_list_v3det + val_list = val_list_v3det + dump_v3det_label_map(args) + + for img_id, 
img_info in tqdm(coco.imgs.items()): + # missing images + if args.dataset == 'o365v2' and img_id in [908726, 320532, 320534]: + print(img_info['file_name']) + continue + if args.dataset == 'o365v1' and img_id in [6, 19, 23]: + print(img_info['file_name']) + continue + + if args.dataset == 'o365v2': + file_name = img_info['file_name'] + if file_name.startswith('images/v2/'): + file_name = file_name.replace('images/v2/', '') + elif file_name.startswith('images/v1/'): + file_name = file_name.replace('images/v1/', '') + img_info['file_name'] = file_name + + ann_ids = coco.getAnnIds(imgIds=img_id) + instance_list = [] + for ann_id in ann_ids: + ann = coco.anns[ann_id] + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + + if ann.get('iscrowd', False): + continue + + bbox_xyxy = [x1, y1, x1 + w, y1 + h] + label = ann['category_id'] + category = nms[label] + ind = val_list.index(label) + label_trans = key_list[ind] + instance_list.append({ + 'bbox': bbox_xyxy, + 'label': label_trans, + 'category': category + }) + metas.append({ + 'filename': img_info['file_name'], + 'height': img_info['height'], + 'width': img_info['width'], + 'detection': { + 'instances': instance_list + } + }) + + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(metas) + + print('save to {}'.format(out_path)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('coco to odvg format.', add_help=True) + parser.add_argument('input', type=str, help='input list name') + parser.add_argument('--output', '-o', type=str, help='input list name') + parser.add_argument( + '--dataset', + '-d', + required=True, + type=str, + choices=['coco', 'o365v1', 'o365v2', 'v3det'], + ) + args = parser.parse_args() + + coco2odvg(args) diff --git a/tools/dataset_converters/exclude_cocotrain2017_from_refcoco.py b/tools/dataset_converters/exclude_cocotrain2017_from_refcoco.py new file mode 100644 index 00000000000..7de2a9ec4e2 --- /dev/null +++ b/tools/dataset_converters/exclude_cocotrain2017_from_refcoco.py @@ -0,0 +1,110 @@ +import argparse +import json +import os.path as osp + +import mmengine +from pycocotools.coco import COCO + + +def diff_image_id(coco2017_train_ids, ref_ids): + set1 = set(coco2017_train_ids) + set2 = set(ref_ids) + intersection = set1.intersection(set2) + result = set1 - intersection + return result + + +def gen_new_json(coco2017_train_path, json_data, coco2017_train_ids): + coco = COCO(coco2017_train_path) + new_json_data = { + 'info': json_data['info'], + 'licenses': json_data['licenses'], + 'categories': json_data['categories'], + 'images': [], + 'annotations': [] + } + + for id in coco2017_train_ids: + ann_ids = coco.getAnnIds(imgIds=[id]) + img_ann_info = coco.loadAnns(ann_ids) + img_info = coco.loadImgs([id])[0] + + new_json_data['images'].append(img_info) + new_json_data['annotations'].extend(img_ann_info) + return new_json_data + + +# coco2017 val and final_mixed_train.json have no intersection, +# so deduplication is not necessary. + +# coco2017 val and datasets like refcoco based on coco2014 train +# have no intersection, so deduplication is not necessary. + + +# coco2017 train and datasets like refcoco based on coco2014 +# train have overlapping annotations in the validation set, +# so deduplication is required. 
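As a concrete, made-up illustration of the set difference that `diff_image_id` computes before the filtered annotation file is rebuilt:

```python
# Made-up image ids: keep only the COCO 2017 train images whose ids never
# appear in any RefCOCO/RefCOCO+/RefCOCOg/gRefCOCO validation split.
coco2017_train_ids = {101, 102, 103, 104, 105}
refcoco_val_ids = {102, 105}
refcocog_val_ids = {103}

kept = coco2017_train_ids - (refcoco_val_ids | refcocog_val_ids)
print(sorted(kept))  # [101, 104]
```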
+def exclude_coco(args): + with open(args.coco2017_train, 'r') as f: + coco2017_train = json.load(f) + coco2017_train_ids = [train['id'] for train in coco2017_train['images']] + orig_len = len(coco2017_train_ids) + + with open(osp.join(args.mdetr_anno_dir, 'finetune_refcoco_val.json'), + 'r') as f: + refcoco_ann = json.load(f) + refcoco_ids = [refcoco['original_id'] for refcoco in refcoco_ann['images']] + coco2017_train_ids = diff_image_id(coco2017_train_ids, refcoco_ids) + + with open( + osp.join(args.mdetr_anno_dir, 'finetune_refcoco+_val.json'), + 'r') as f: + refcoco_plus_ann = json.load(f) + refcoco_plus_ids = [ + refcoco['original_id'] for refcoco in refcoco_plus_ann['images'] + ] + coco2017_train_ids = diff_image_id(coco2017_train_ids, refcoco_plus_ids) + + with open( + osp.join(args.mdetr_anno_dir, 'finetune_refcocog_val.json'), + 'r') as f: + refcocog_ann = json.load(f) + refcocog_ids = [ + refcoco['original_id'] for refcoco in refcocog_ann['images'] + ] + coco2017_train_ids = diff_image_id(coco2017_train_ids, refcocog_ids) + + with open( + osp.join(args.mdetr_anno_dir, 'finetune_grefcoco_val.json'), + 'r') as f: + grefcoco_ann = json.load(f) + grefcoco_ids = [ + refcoco['original_id'] for refcoco in grefcoco_ann['images'] + ] + coco2017_train_ids = diff_image_id(coco2017_train_ids, grefcoco_ids) + + coco2017_train_ids = list(coco2017_train_ids) + print( + 'remove {} images from coco2017_train'.format(orig_len - + len(coco2017_train_ids))) + + new_json_data = gen_new_json(args.coco2017_train, coco2017_train, + coco2017_train_ids) + if args.out_ann is None: + out_ann = osp.dirname( + args.coco2017_train) + '/instances_train2017_norefval.json' + mmengine.dump(new_json_data, out_ann) + print('save new json to {}'.format(out_ann)) + else: + mmengine.dump(new_json_data, args.out_ann) + print('save new json to {}'.format(args.out_ann)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('coco to odvg format.', add_help=True) + parser.add_argument('mdetr_anno_dir', type=str) + parser.add_argument('coco2017_train', type=str) + parser.add_argument('--out-ann', '-o', type=str) + args = parser.parse_args() + + exclude_coco(args) diff --git a/tools/dataset_converters/extract_coco_from_mixed.py b/tools/dataset_converters/extract_coco_from_mixed.py new file mode 100644 index 00000000000..d4777b0fd07 --- /dev/null +++ b/tools/dataset_converters/extract_coco_from_mixed.py @@ -0,0 +1,45 @@ +import argparse +import os.path as osp + +import mmengine +from pycocotools.coco import COCO + + +def extract_coco(args): + coco = COCO(args.mixed_ann) + + json_data = mmengine.load(args.mixed_ann) + new_json_data = { + 'info': json_data['info'], + 'licenses': json_data['licenses'], + 'categories': json_data['categories'], + 'images': [], + 'annotations': [] + } + del json_data + + img_ids = coco.getImgIds() + for img_id in img_ids: + img_info = coco.loadImgs([img_id])[0] + if img_info['data_source'] == 'coco': + new_json_data['images'].append(img_info) + ann_ids = coco.getAnnIds(imgIds=[img_id]) + img_ann_info = coco.loadAnns(ann_ids) + new_json_data['annotations'].extend(img_ann_info) + if args.out_ann is None: + out_ann = osp.dirname( + args.mixed_ann) + '/final_mixed_train_only_coco.json' + mmengine.dump(new_json_data, out_ann) + print('save new json to {}'.format(out_ann)) + else: + mmengine.dump(new_json_data, args.out_ann) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + 'split mixed goldg to coco.', add_help=True) + parser.add_argument('mixed_ann', type=str) + 
parser.add_argument('--out-ann', '-o', type=str) + args = parser.parse_args() + + extract_coco(args) diff --git a/tools/dataset_converters/fix_o365_names.py b/tools/dataset_converters/fix_o365_names.py new file mode 100644 index 00000000000..fa947bf9c9b --- /dev/null +++ b/tools/dataset_converters/fix_o365_names.py @@ -0,0 +1,40 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import argparse +import copy +import json + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--ann', + default='data/objects365v2/annotations/zhiyuan_objv2_train.json') + parser.add_argument( + '--fix_name_map', + default='tools/dataset_converters/zhiyuan_objv2_train_names_fix.csv') + args = parser.parse_args() + + new_names = {} + old_names = {} + with open(args.fix_name_map, 'r') as f: + for line in f: + tmp = line.strip().split(',') + old_names[int(tmp[0])] = tmp[1] + new_names[int(tmp[0])] = tmp[2] + data = json.load(open(args.ann, 'r')) + + cat_info = copy.deepcopy(data['categories']) + + for x in cat_info: + if old_names[x['id']].strip() != x['name'].strip(): + print('{} {} {}'.format(x, old_names[x['id']], new_names[x['id']])) + import pdb + + pdb.set_trace() + if old_names[x['id']] != new_names[x['id']]: + print('Renaming', x['id'], x['name'], new_names[x['id']]) + x['name'] = new_names[x['id']] + + data['categories'] = cat_info + out_name = args.ann[:-5] + '_fixname.json' + print('Saving to', out_name) + json.dump(data, open(out_name, 'w')) diff --git a/tools/dataset_converters/goldg2odvg.py b/tools/dataset_converters/goldg2odvg.py new file mode 100644 index 00000000000..15dde2baff6 --- /dev/null +++ b/tools/dataset_converters/goldg2odvg.py @@ -0,0 +1,136 @@ +import argparse + +import jsonlines +from pycocotools.coco import COCO +from tqdm import tqdm + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + return True + + +def goldg2odvg(args): + coco = COCO(args.input) + ids = list(sorted(coco.imgs.keys())) + + out_results = [] + for img_id in tqdm(ids): + if isinstance(img_id, str): + ann_ids = coco.getAnnIds(imgIds=[img_id], iscrowd=0) + else: + ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=0) + annos = coco.loadAnns(ann_ids) + if not has_valid_annotation(annos): + continue + + img_info = coco.loadImgs(img_id)[0] + file_name = img_info['file_name'] + caption = img_info['caption'] + + regions = {} + + for anno in annos: + box = anno['bbox'] + tokens_positive = anno['tokens_positive'] + x1, y1, w, h = box + inter_w = max(0, min(x1 + w, int(img_info['width'])) - max(x1, 0)) + inter_h = max(0, min(y1 + h, int(img_info['height'])) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if anno['area'] <= 0 or w < 1 or h < 1: + continue + + if anno.get('iscrowd', False): + continue + bbox_xyxy = [ + x1, y1, + min(x1 + w, int(img_info['width'])), + min(y1 + h, int(img_info['height'])) + ] + + tokens_positive = sorted(tokens_positive, key=lambda x: x[0]) + + phrase = [] + pre_end_index = -10 + for token in tokens_positive: + start_index = token[0] + end_index = token[1] + if pre_end_index + 1 == start_index: + if caption[token[0] - 1] == ' ': + phrase[ + -1] = phrase[-1] + ' ' + caption[token[0]:token[1]] + else: + phrase.append(caption[token[0]:token[1]]) + else: + 
phrase.append(caption[token[0]:token[1]]) + pre_end_index = end_index + + key = ' '.join(phrase) + + if key not in regions: + regions[key] = { + 'bbox': bbox_xyxy, + 'phrase': phrase, + 'tokens_positive': tokens_positive + } + else: + old_box = regions[key]['bbox'] + if isinstance(old_box[0], list): + old_box.append(bbox_xyxy) + else: + old_box = [old_box, bbox_xyxy] + + regions[key]['bbox'] = old_box + + out_dict = { + 'filename': file_name, + 'height': int(img_info['height']), + 'width': int(img_info['width']), + 'grounding': { + 'caption': caption + } + } + + region_list = [] + for key, value in regions.items(): + phrase = value['phrase'] + if len(phrase) == 1: + phrase = phrase[0] + region_list.append({ + 'bbox': value['bbox'], + 'phrase': phrase, + 'tokens_positive': value['tokens_positive'] + }) + out_dict['grounding']['regions'] = region_list + out_results.append(out_dict) + + if args.out_ann is None: + out_path = args.input[:-5] + '_vg.json' + else: + out_path = args.out_ann + + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(out_results) + print(f'save to {out_path}') + + +# goldg+: final_mixed_train_no_coco.json + +# final_flickr_separateGT_train.json + +# final_mixed_train_only_coco.json +if __name__ == '__main__': + parser = argparse.ArgumentParser('goldg to odvg format.', add_help=True) + parser.add_argument('input', type=str, help='input list name') + parser.add_argument('--out-ann', '-o', type=str) + args = parser.parse_args() + + goldg2odvg(args) diff --git a/tools/dataset_converters/grit2odvg.py b/tools/dataset_converters/grit2odvg.py new file mode 100644 index 00000000000..3d1c6d1a5e7 --- /dev/null +++ b/tools/dataset_converters/grit2odvg.py @@ -0,0 +1,189 @@ +import argparse +import json +import multiprocessing +import os +import os.path as osp + +import emoji +import jsonlines +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') +is_debug = False + + +def is_valid_caption(caption, rules={'↙️', '[CLS]', '[SEP]'}): + check_anno = caption.strip( + )[:-1] # Remove the ending delimiter from the caption. + for ch in rules: + if ch in check_anno: + return False + return True + + +def process_one_file(anno_file, result_queue): + print('processing', anno_file) + with open(anno_file, 'r') as f: + metas = json.load(f) + + results = [] + for meta in metas: + # print('============================') + file_name = meta['key'][0:5] + '/' + meta['key'] + '.jpg' + file_name = osp.join('images', file_name) + + h = meta['height'] + w = meta['width'] + + caption = meta['caption'] + # Weird captions are filtered out from the beginning. + if not is_valid_caption(caption): + if is_debug: + print('=====caption filtered====', caption) + continue + + # Captions exceeding 240 tokens are filtered out, + # where 240 is an empirical value. + tokenized = tokenizer([caption], return_tensors='pt') + if tokenized.input_ids.shape[1] >= 240: + if is_debug: + print('=====token filtered====', caption) + continue + + ref_exps = meta['ref_exps'] + ref_captions = [i[0:2] for i in ref_exps] + ref_token_positives = [i[0:2] for i in ref_exps] + ref_captions = [caption[int(i[0]):int(i[1])] for i in ref_captions] + ref_boxes = [i[2:6] for i in ref_exps] + + regions = {} + for bbox, ref_caption, tokens_positive in zip(ref_boxes, ref_captions, + ref_token_positives): + # If the current reference includes special delimiters, + # it will be filtered out. 
+ if not is_valid_caption( + caption, rules={'.', '?', ' ', "\'", "\""}): + if is_debug: + print('=====ref filtered====', caption) + continue + # If the current reference contains non-ASCII characters, + # it will be filtered out. + if not str.isascii(caption): + if is_debug: + print('=====ref filtered====', caption) + continue + # If the current reference includes non-ASCII characters, + # it will be filtered out. + if emoji.emoji_count(caption): + if is_debug: + print('=====ref filtered====', caption) + continue + + box = [ + round(bbox[0] * w, 3), + round(bbox[1] * h, 3), + round((bbox[2]) * w, 3), + round((bbox[3]) * h, 3) + ] + x1, y1, x2, y2 = box + inter_w = max(0, min(x1 + w, int(w)) - max(x1, 0)) + inter_h = max(0, min(y1 + h, int(h)) - max(y1, 0)) + if inter_w * inter_h == 0: + if is_debug: + print('=====wh filtered====', box) + continue + if w <= 1 or h <= 1: + if is_debug: + print('=====area filtered====', box) + continue + + if ref_caption not in regions: + regions[ref_caption] = { + 'bbox': + box, + 'phrase': + ref_caption, + 'tokens_positive': + [[int(tokens_positive[0]), + int(tokens_positive[1])]], + } + else: + old_box = regions[ref_caption]['bbox'] + if isinstance(old_box[0], list): + old_box.append(box) + else: + old_box = [old_box, box] + regions[ref_caption]['bbox'] = old_box + + if len(regions) > 0: + print('caption: ', caption) + print('regions', regions) + else: + if is_debug: + print('caption: ', caption) + print('regions', regions) + + if len(regions) == 0: + continue + + out_dict = { + 'filename': file_name, + 'height': int(h), + 'width': int(w), + 'grounding': { + 'caption': caption + } + } + + region_list = [] + for key, value in regions.items(): + phrase = value['phrase'] + if len(phrase) == 1: + phrase = phrase[0] + region_list.append({ + 'bbox': value['bbox'], + 'phrase': phrase, + 'tokens_positive': value['tokens_positive'] + }) + out_dict['grounding']['regions'] = region_list + print(out_dict) + results.append(out_dict) + result_queue.put(results) + + +def grit2odvg(args): + annotations_dir = osp.join(args.data_root, 'annotations') + annos_files = [ + osp.join(annotations_dir, anno) for anno in os.listdir(annotations_dir) + if anno.endswith('.json') and not anno.endswith('vg.json') + ] + + annos_files = annos_files[:2] + + manager = multiprocessing.Manager() + result_queue = manager.Queue() + pool = multiprocessing.Pool(processes=min(len(annos_files), 16)) + + for anno_file in annos_files: + pool.apply_async(process_one_file, args=(anno_file, result_queue)) + + pool.close() + pool.join() + + out_datas = [] + while not result_queue.empty(): + out_datas.extend(result_queue.get()) + + out_path = osp.join(args.data_root, 'grit20m_vg.json') + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(out_datas) + print('save to ', out_path) + print('total img: ', len(out_datas)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('grit to odvg format.', add_help=True) + parser.add_argument('data_root', type=str, help='input dir name') + args = parser.parse_args() + + grit2odvg(args) diff --git a/tools/dataset_converters/lvis2odvg.py b/tools/dataset_converters/lvis2odvg.py new file mode 100644 index 00000000000..ce0c4381b35 --- /dev/null +++ b/tools/dataset_converters/lvis2odvg.py @@ -0,0 +1,98 @@ +import argparse +import json +import os.path + +import jsonlines +from lvis import LVIS +from tqdm import tqdm + +key_list_lvis = [i for i in range(1203)] +val_list_lvis = [i for i in range(1, 1204)] + + +def dump_lvis_label_map(args): + 
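# Write a label map from 0-based contiguous ids to LVIS category names (lvis_v1_label_map.json).
+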
with open(args.input, 'r') as f: + j = json.load(f) + o_dict = {} + for category in j['categories']: + index = str(int(category['id']) - 1) + name = category['name'] + o_dict[index] = name + if args.output is None: + output = os.path.dirname(args.input) + '/lvis_v1_label_map.json' + else: + output = os.path.dirname(args.output) + '/lvis_v1_label_map.json' + with open(output, 'w') as f: + json.dump(o_dict, f) + + +def lvis2odvg(args): + lvis = LVIS(args.input) + cats = lvis.load_cats(lvis.get_cat_ids()) + nms = {cat['id']: cat['name'] for cat in cats} + metas = [] + if args.output is None: + out_path = args.input[:-5] + '_od.json' + else: + out_path = args.output + + key_list = key_list_lvis + val_list = val_list_lvis + dump_lvis_label_map(args) + + for img_id, img_info in tqdm(lvis.imgs.items()): + file_name = img_info['coco_url'].replace( + 'http://images.cocodataset.org/', '') + ann_ids = lvis.get_ann_ids(img_ids=[img_id]) + raw_ann_info = lvis.load_anns(ann_ids) + instance_list = [] + for ann in raw_ann_info: + if ann.get('ignore', False): + print(f'invalid ignore box of {ann}') + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + print(f'invalid wh box of {ann}') + continue + if ann['area'] <= 0 or w < 1 or h < 1: + print(f'invalid area box of {ann}, ' + f'w={img_info["width"]}, h={img_info["height"]}') + continue + + if ann.get('iscrowd', False): + print(f'invalid iscrowd box of {ann}') + continue + + bbox_xyxy = [x1, y1, x1 + w, y1 + h] + label = ann['category_id'] + category = nms[label] + ind = val_list.index(label) + label_trans = key_list[ind] + instance_list.append({ + 'bbox': bbox_xyxy, + 'label': label_trans, + 'category': category + }) + metas.append({ + 'filename': file_name, + 'height': img_info['height'], + 'width': img_info['width'], + 'detection': { + 'instances': instance_list + } + }) + + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(metas) + + print('save to {}'.format(out_path)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('lvis to odvg format.', add_help=True) + parser.add_argument('input', type=str, help='input list name') + parser.add_argument('--output', '-o', type=str, help='input list name') + args = parser.parse_args() + lvis2odvg(args) diff --git a/tools/dataset_converters/objects365_v2_names_fix.csv b/tools/dataset_converters/objects365_v2_names_fix.csv new file mode 100644 index 00000000000..33b0aa946c6 --- /dev/null +++ b/tools/dataset_converters/objects365_v2_names_fix.csv @@ -0,0 +1,365 @@ +1,Person,Person +2,Sneakers,Sneakers +3,Chair,Chair +4,Other Shoes,Other Shoes +5,Hat,Hat +6,Car,Car +7,Lamp,Lamp +8,Glasses,Glasses +9,Bottle,Bottle +10,Desk,Desk +11,Cup,Cup +12,Street Lights,Street Lights +13,Cabinet/shelf,Cabinet/shelf +14,Handbag/Satchel,Handbag/Satchel +15,Bracelet,Bracelet +16,Plate,Plate +17,Picture/Frame,Picture/Frame +18,Helmet,Helmet +19,Book,Book +20,Gloves,Gloves +21,Storage box,Storage box +22,Boat,Boat +23,Leather Shoes,Leather Shoes +24,Flower,Flower +25,Bench,Bench +26,Potted Plant,Potted Plant +27,Bowl/Basin,Bowl/Basin +28,Flag,Flag +29,Pillow,Pillow +30,Boots,Boots +31,Vase,Vase +32,Microphone,Microphone +33,Necklace,Necklace +34,Ring,Ring +35,SUV,SUV +36,Wine Glass,Wine Glass +37,Belt,Belt +38,Moniter/TV,Monitor/TV +39,Backpack,Backpack +40,Umbrella,Umbrella +41,Traffic Light,Traffic Light +42,Speaker,Speaker +43,Watch,Watch +44,Tie,Tie 
+45,Trash bin Can,Trash bin Can +46,Slippers,Slippers +47,Bicycle,Bicycle +48,Stool,Stool +49,Barrel/bucket,Barrel/bucket +50,Van,Van +51,Couch,Couch +52,Sandals,Sandals +53,Bakset,Basket +54,Drum,Drum +55,Pen/Pencil,Pen/Pencil +56,Bus,Bus +57,Wild Bird,Wild Bird +58,High Heels,High Heels +59,Motorcycle,Motorcycle +60,Guitar,Guitar +61,Carpet,Carpet +62,Cell Phone,Cell Phone +63,Bread,Bread +64,Camera,Camera +65,Canned,Canned +66,Truck,Truck +67,Traffic cone,Traffic cone +68,Cymbal,Cymbal +69,Lifesaver,Lifesaver +70,Towel,Towel +71,Stuffed Toy,Stuffed Toy +72,Candle,Candle +73,Sailboat,Sailboat +74,Laptop,Laptop +75,Awning,Awning +76,Bed,Bed +77,Faucet,Faucet +78,Tent,Tent +79,Horse,Horse +80,Mirror,Mirror +81,Power outlet,Power outlet +82,Sink,Sink +83,Apple,Apple +84,Air Conditioner,Air Conditioner +85,Knife,Knife +86,Hockey Stick,Hockey Stick +87,Paddle,Paddle +88,Pickup Truck,Pickup Truck +89,Fork,Fork +90,Traffic Sign,Traffic Sign +91,Ballon,Ballon +92,Tripod,Tripod +93,Dog,Dog +94,Spoon,Spoon +95,Clock,Clock +96,Pot,Pot +97,Cow,Cow +98,Cake,Cake +99,Dinning Table,Dining Table +100,Sheep,Sheep +101,Hanger,Hanger +102,Blackboard/Whiteboard,Blackboard/Whiteboard +103,Napkin,Napkin +104,Other Fish,Other Fish +105,Orange/Tangerine,Orange/Tangerine +106,Toiletry,Toiletry +107,Keyboard,Keyboard +108,Tomato,Tomato +109,Lantern,Lantern +110,Machinery Vehicle,Machinery Vehicle +111,Fan,Fan +112,Green Vegetables,Green Vegetables +113,Banana,Banana +114,Baseball Glove,Baseball Glove +115,Airplane,Airplane +116,Mouse,Mouse +117,Train,Train +118,Pumpkin,Pumpkin +119,Soccer,Soccer +120,Skiboard,Skiboard +121,Luggage,Luggage +122,Nightstand,Nightstand +123,Tea pot,Teapot +124,Telephone,Telephone +125,Trolley,Trolley +126,Head Phone,Head Phone +127,Sports Car,Sports Car +128,Stop Sign,Stop Sign +129,Dessert,Dessert +130,Scooter,Scooter +131,Stroller,Stroller +132,Crane,Crane +133,Remote,Remote +134,Refrigerator,Refrigerator +135,Oven,Oven +136,Lemon,Lemon +137,Duck,Duck +138,Baseball Bat,Baseball Bat +139,Surveillance Camera,Surveillance Camera +140,Cat,Cat +141,Jug,Jug +142,Broccoli,Broccoli +143,Piano,Piano +144,Pizza,Pizza +145,Elephant,Elephant +146,Skateboard,Skateboard +147,Surfboard,Surfboard +148,Gun,Gun +149,Skating and Skiing shoes,Skating and Skiing shoes +150,Gas stove,Gas stove +151,Donut,Donut +152,Bow Tie,Bow Tie +153,Carrot,Carrot +154,Toilet,Toilet +155,Kite,Kite +156,Strawberry,Strawberry +157,Other Balls,Other Balls +158,Shovel,Shovel +159,Pepper,Pepper +160,Computer Box,Computer Box +161,Toilet Paper,Toilet Paper +162,Cleaning Products,Cleaning Products +163,Chopsticks,Chopsticks +164,Microwave,Microwave +165,Pigeon,Pigeon +166,Baseball,Baseball +167,Cutting/chopping Board,Cutting/chopping Board +168,Coffee Table,Coffee Table +169,Side Table,Side Table +170,Scissors,Scissors +171,Marker,Marker +172,Pie,Pie +173,Ladder,Ladder +174,Snowboard,Snowboard +175,Cookies,Cookies +176,Radiator,Radiator +177,Fire Hydrant,Fire Hydrant +178,Basketball,Basketball +179,Zebra,Zebra +180,Grape,Grape +181,Giraffe,Giraffe +182,Potato,Potato +183,Sausage,Sausage +184,Tricycle,Tricycle +185,Violin,Violin +186,Egg,Egg +187,Fire Extinguisher,Fire Extinguisher +188,Candy,Candy +189,Fire Truck,Fire Truck +190,Billards,Billiards +191,Converter,Converter +192,Bathtub,Bathtub +193,Wheelchair,Wheelchair +194,Golf Club,Golf Club +195,Briefcase,Briefcase +196,Cucumber,Cucumber +197,Cigar/Cigarette,Cigar/Cigarette +198,Paint Brush,Paint Brush +199,Pear,Pear +200,Heavy Truck,Heavy Truck +201,Hamburger,Hamburger 
+202,Extractor,Extractor +203,Extention Cord,Extension Cord +204,Tong,Tong +205,Tennis Racket,Tennis Racket +206,Folder,Folder +207,American Football,American Football +208,earphone,earphone +209,Mask,Mask +210,Kettle,Kettle +211,Tennis,Tennis +212,Ship,Ship +213,Swing,Swing +214,Coffee Machine,Coffee Machine +215,Slide,Slide +216,Carriage,Carriage +217,Onion,Onion +218,Green beans,Green beans +219,Projector,Projector +220,Frisbee,Frisbee +221,Washing Machine/Drying Machine,Washing Machine/Drying Machine +222,Chicken,Chicken +223,Printer,Printer +224,Watermelon,Watermelon +225,Saxophone,Saxophone +226,Tissue,Tissue +227,Toothbrush,Toothbrush +228,Ice cream,Ice cream +229,Hotair ballon,Hot air balloon +230,Cello,Cello +231,French Fries,French Fries +232,Scale,Scale +233,Trophy,Trophy +234,Cabbage,Cabbage +235,Hot dog,Hot dog +236,Blender,Blender +237,Peach,Peach +238,Rice,Rice +239,Wallet/Purse,Wallet/Purse +240,Volleyball,Volleyball +241,Deer,Deer +242,Goose,Goose +243,Tape,Tape +244,Tablet,Tablet +245,Cosmetics,Cosmetics +246,Trumpet,Trumpet +247,Pineapple,Pineapple +248,Golf Ball,Golf Ball +249,Ambulance,Ambulance +250,Parking meter,Parking meter +251,Mango,Mango +252,Key,Key +253,Hurdle,Hurdle +254,Fishing Rod,Fishing Rod +255,Medal,Medal +256,Flute,Flute +257,Brush,Brush +258,Penguin,Penguin +259,Megaphone,Megaphone +260,Corn,Corn +261,Lettuce,Lettuce +262,Garlic,Garlic +263,Swan,Swan +264,Helicopter,Helicopter +265,Green Onion,Green Onion +266,Sandwich,Sandwich +267,Nuts,Nuts +268,Speed Limit Sign,Speed Limit Sign +269,Induction Cooker,Induction Cooker +270,Broom,Broom +271,Trombone,Trombone +272,Plum,Plum +273,Rickshaw,Rickshaw +274,Goldfish,Goldfish +275,Kiwi fruit,Kiwi fruit +276,Router/modem,Router/modem +277,Poker Card,Poker Card +278,Toaster,Toaster +279,Shrimp,Shrimp +280,Sushi,Sushi +281,Cheese,Cheese +282,Notepaper,Notepaper +283,Cherry,Cherry +284,Pliers,Pliers +285,CD,CD +286,Pasta,Pasta +287,Hammer,Hammer +288,Cue,Cue +289,Avocado,Avocado +290,Hamimelon,Hami melon +291,Flask,Flask +292,Mushroon,Mushroom +293,Screwdriver,Screwdriver +294,Soap,Soap +295,Recorder,Recorder +296,Bear,Bear +297,Eggplant,Eggplant +298,Board Eraser,Board Eraser +299,Coconut,Coconut +300,Tape Measur/ Ruler,Tape Measure/ Ruler +301,Pig,Pig +302,Showerhead,Showerhead +303,Globe,Globe +304,Chips,Chips +305,Steak,Steak +306,Crosswalk Sign,Crosswalk Sign +307,Stapler,Stapler +308,Campel,Camel +309,Formula 1,Formula 1 +310,Pomegranate,Pomegranate +311,Dishwasher,Dishwasher +312,Crab,Crab +313,Hoverboard,Hoverboard +314,Meat ball,Meatball +315,Rice Cooker,Rice Cooker +316,Tuba,Tuba +317,Calculator,Calculator +318,Papaya,Papaya +319,Antelope,Antelope +320,Parrot,Parrot +321,Seal,Seal +322,Buttefly,Butterfly +323,Dumbbell,Dumbbell +324,Donkey,Donkey +325,Lion,Lion +326,Urinal,Urinal +327,Dolphin,Dolphin +328,Electric Drill,Electric Drill +329,Hair Dryer,Hair Dryer +330,Egg tart,Egg tart +331,Jellyfish,Jellyfish +332,Treadmill,Treadmill +333,Lighter,Lighter +334,Grapefruit,Grapefruit +335,Game board,Game board +336,Mop,Mop +337,Radish,Radish +338,Baozi,Baozi +339,Target,Target +340,French,French +341,Spring Rolls,Spring Rolls +342,Monkey,Monkey +343,Rabbit,Rabbit +344,Pencil Case,Pencil Case +345,Yak,Yak +346,Red Cabbage,Red Cabbage +347,Binoculars,Binoculars +348,Asparagus,Asparagus +349,Barbell,Barbell +350,Scallop,Scallop +351,Noddles,Noddles +352,Comb,Comb +353,Dumpling,Dumpling +354,Oyster,Oyster +355,Table Teniis paddle,Table Tennis paddle +356,Cosmetics Brush/Eyeliner Pencil,Cosmetics Brush/Eyeliner 
Pencil +357,Chainsaw,Chainsaw +358,Eraser,Eraser +359,Lobster,Lobster +360,Durian,Durian +361,Okra,Okra +362,Lipstick,Lipstick +363,Cosmetics Mirror,Cosmetics Mirror +364,Curling,Curling +365,Table Tennis,Table Tennis diff --git a/tools/dataset_converters/openimages2odvg.py b/tools/dataset_converters/openimages2odvg.py new file mode 100644 index 00000000000..ad0bc8075aa --- /dev/null +++ b/tools/dataset_converters/openimages2odvg.py @@ -0,0 +1,187 @@ +import argparse +import copy +import csv +import json +import os.path as osp + +import jsonlines +from mmcv.image import imfrombytes +from mmengine.fileio import get + + +def _parse_label_file(label_file): + index_list = [] + classes_names = [] + with open(label_file, 'r') as f: + reader = csv.reader(f) + for line in reader: + classes_names.append(line[1]) + index_list.append(line[0]) + index_mapping = {index: i for i, index in enumerate(index_list)} + return classes_names, index_mapping + + +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + + +def oi2odvg(args): + ann_file = osp.join(args.input_dir, 'oidv6-train-annotations-bbox.csv') + label_file = osp.join(args.input_dir, 'class-descriptions-boxable.csv') + + classes_names, index_mapping = _parse_label_file(label_file) + + label_map = {} + for class_name, idx in index_mapping.items(): + class_name = classes_names[idx] + label_map[str(idx)] = class_name + + if args.out_ann is None: + output = osp.join(args.input_dir, 'openimages_label_map.json') + else: + output = osp.join( + osp.dirname(args.out_ann), 'openimages_label_map.json') + with open(output, 'w') as f: + json.dump(label_map, f) + + metas = [] + skip_count = 0 + with open(ann_file, 'r') as f: + reader = csv.reader(f) + last_img_id = None + _filename_shape = [0, 0] + instances = [] + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + if last_img_id is None: + last_img_id = img_id + label_id = line[2] + + filename = f'{img_id}.jpg' + label = index_mapping[label_id] + category = label_map[str(label)] + bbox = [ + float(line[4]), # xmin + float(line[6]), # ymin + float(line[5]), # xmax + float(line[7]) # ymax + ] + + # is_occluded = True if int(line[8]) == 1 else False + # is_truncated = True if int(line[9]) == 1 else False + is_group_of = True if int(line[10]) == 1 else False + # is_depiction = True if int(line[11]) == 1 else False + # is_inside = True if int(line[12]) == 1 else False + + # if any([is_occluded, is_truncated, is_group_of, + # is_depiction, is_inside]): + if is_group_of: + print(f'skip {filename} of one instance') + skip_count += 1 + continue + + # denormalize + if filename != _filename_shape[0]: + if args.img_prefix is not None: + _filename = osp.join( + osp.dirname(args.input_dir), args.img_prefix, filename) + else: + _filename = osp.join(osp.dirname(args.input_dir), filename) + img_bytes = get(_filename, backend_args) + img = imfrombytes(img_bytes, flag='color') + shape = img.shape + _filename_shape = [filename, shape] + else: + shape = _filename_shape[1] + + h, w = shape[:2] + bbox = [ + max(bbox[0] * w, 0), + max(bbox[1] * h, 0), + min(bbox[2] * w, w), + min(bbox[3] * h, h) + ] + + x1, y1, x2, y2 = bbox + inter_w = max(0, min(x2, w) - max(x1, 0)) + inter_h = max(0, min(y2, h) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if w < 1 or h < 1: + continue + + instance = { + 'filename': filename, + 'height': h, + 'width': w, + 'bbox': 
bbox, + 'label': label, + 'category': category + } + + if img_id != last_img_id: + copy_instances = copy.deepcopy(instances) + for copy_instance in copy_instances: + _filename = copy_instance.pop('filename') + _h = copy_instance.pop('height') + _w = copy_instance.pop('width') + + meta_ifo = { + 'filename': _filename, + 'height': _h, + 'width': _w, + 'detection': { + 'instances': copy_instances + } + } + metas.append(meta_ifo) + instances = [] + instances.append(instance) + last_img_id = img_id + + for instance in instances: + _filename = instance.pop('filename') + _h = instance.pop('height') + _w = instance.pop('width') + meta_ifo = { + 'filename': _filename, + 'height': _h, + 'width': _w, + 'detection': { + 'instances': instances + } + } + metas.append(meta_ifo) + + if args.out_ann is None: + out_path = osp.join(args.input_dir, 'oidv6-train-annotations-vg.jsonl') + else: + out_path = args.out_ann + + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(metas) + + print('skip {} instances'.format(skip_count)) + print('save to {}'.format(out_path)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + 'openimages to odvg format.', add_help=True) + parser.add_argument( + '--input-dir', + default='data/OpenImages/annotations', + type=str, + help='input list name') + parser.add_argument('--img-prefix', default='OpenImages/train/') + parser.add_argument('--out-ann', '-o', type=str) + args = parser.parse_args() + + oi2odvg(args) diff --git a/tools/dataset_converters/refcoco2odvg.py b/tools/dataset_converters/refcoco2odvg.py new file mode 100644 index 00000000000..c11869b3855 --- /dev/null +++ b/tools/dataset_converters/refcoco2odvg.py @@ -0,0 +1,147 @@ +import argparse +import os.path as osp + +import jsonlines +from pycocotools.coco import COCO +from tqdm import tqdm + + +def parse_args(): + parser = argparse.ArgumentParser(description='refcoco to odvg') + parser.add_argument('mdetr_anno_dir', type=str) + parser.add_argument('--out-dir', '-o', type=str) + args = parser.parse_args() + return args + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj['bbox'][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + return True + + +def process_item(args, filename): + path = osp.join(args.mdetr_anno_dir, filename) + coco = COCO(path) + + ids = list(sorted(coco.imgs.keys())) + + out_results = [] + for img_id in tqdm(ids): + if isinstance(img_id, str): + ann_ids = coco.getAnnIds(imgIds=[img_id], iscrowd=0) + else: + ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=0) + annos = coco.loadAnns(ann_ids) + if not has_valid_annotation(annos): + continue + + img_info = coco.loadImgs(img_id)[0] + file_name = img_info['file_name'] + caption = img_info['caption'] + + regions = {} + + for anno in annos: + box = anno['bbox'] + tokens_positive = anno['tokens_positive'] + x1, y1, w, h = box + inter_w = max(0, min(x1 + w, int(img_info['width'])) - max(x1, 0)) + inter_h = max(0, min(y1 + h, int(img_info['height'])) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if anno['area'] <= 0 or w < 1 or h < 1: + continue + + if anno.get('iscrowd', False): + continue + bbox_xyxy = [ + x1, y1, + min(x1 + w, int(img_info['width'])), + min(y1 + h, int(img_info['height'])) + ] + + tokens_positive = sorted(tokens_positive, key=lambda x: x[0]) + + phrase = [] + pre_end_index = -10 
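+                # Merge adjacent tokens_positive spans so each referring phrase is rebuilt from the caption.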
+ for token in tokens_positive: + start_index = token[0] + end_index = token[1] + if pre_end_index + 1 == start_index: + if caption[token[0] - 1] == ' ': + phrase[ + -1] = phrase[-1] + ' ' + caption[token[0]:token[1]] + else: + phrase.append(caption[token[0]:token[1]]) + else: + phrase.append(caption[token[0]:token[1]]) + pre_end_index = end_index + + key = ' '.join(phrase) + + if key not in regions: + regions[key] = { + 'bbox': bbox_xyxy, + 'phrase': phrase, + 'tokens_positive': tokens_positive + } + else: + old_box = regions[key]['bbox'] + if isinstance(old_box[0], list): + old_box.append(bbox_xyxy) + else: + old_box = [old_box, bbox_xyxy] + + regions[key]['bbox'] = old_box + + out_dict = { + 'filename': file_name, + 'height': int(img_info['height']), + 'width': int(img_info['width']), + 'grounding': { + 'caption': caption + } + } + + region_list = [] + for key, value in regions.items(): + phrase = value['phrase'] + if len(phrase) == 1: + phrase = phrase[0] + region_list.append({ + 'bbox': value['bbox'], + 'phrase': phrase, + 'tokens_positive': value['tokens_positive'] + }) + out_dict['grounding']['regions'] = region_list + out_results.append(out_dict) + + if args.out_dir is None: + out_path = osp.join(args.mdetr_anno_dir, filename[:-5] + '_vg.json') + else: + out_path = osp.join(args.out_dir, filename[:-5] + '_vg.json') + + with jsonlines.open(out_path, mode='w') as writer: + writer.write_all(out_results) + print(f'save to {out_path}') + + +def main(): + args = parse_args() + process_item(args, 'finetune_refcoco_train.json') + process_item(args, 'finetune_refcoco+_train.json') + process_item(args, 'finetune_refcocog_train.json') + process_item(args, 'finetune_grefcoco_train.json') + + +if __name__ == '__main__': + main() diff --git a/tools/misc/split_odvg.py b/tools/misc/split_odvg.py new file mode 100644 index 00000000000..37fae909859 --- /dev/null +++ b/tools/misc/split_odvg.py @@ -0,0 +1,80 @@ +import argparse +import json +import os +import shutil + +import jsonlines +import numpy as np +from mmengine.utils import ProgressBar, mkdir_or_exist + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('data_root', type=str, help='The data root.') + parser.add_argument('ann_file', type=str) + parser.add_argument('img_prefix', type=str) + parser.add_argument( + 'out_dir', + type=str, + help='The output directory of coco semi-supervised annotations.') + parser.add_argument( + '--label-map-file', '-m', type=str, help='label map file') + parser.add_argument( + '--num-img', + '-n', + default=200, + type=int, + help='num of extract image, -1 means all images') + parser.add_argument('--seed', default=-1, type=int, help='seed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + assert args.out_dir != args.data_root, \ + 'The file will be overwritten in place, ' \ + 'so the same folder is not allowed !' 
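+    # Seed numpy's RNG when --seed is given so the sampled image subset is reproducible.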
+
+    seed = int(args.seed)
+    if seed != -1:
+        print(f'Set the global seed: {seed}')
+        np.random.seed(int(args.seed))
+
+    ann_file = os.path.join(args.data_root, args.ann_file)
+    with open(ann_file, 'r') as f:
+        data_list = [json.loads(line) for line in f]
+
+    np.random.shuffle(data_list)
+
+    num_img = args.num_img
+    if num_img == -1:
+        # -1 means keep all images, as documented in the --num-img help.
+        num_img = len(data_list)
+    # Guard against requesting more images than the annotation file contains.
+    num_img = min(num_img, len(data_list))
+
+    progress_bar = ProgressBar(num_img)
+    for i in range(num_img):
+        file_name = data_list[i]['filename']
+        image_path = os.path.join(args.data_root, args.img_prefix, file_name)
+        out_image_dir = os.path.join(args.out_dir, args.img_prefix)
+        mkdir_or_exist(out_image_dir)
+        out_image_path = os.path.join(out_image_dir, file_name)
+        shutil.copyfile(image_path, out_image_path)
+
+        progress_bar.update()
+
+    out_path = os.path.join(args.out_dir, args.ann_file)
+    out_dir = os.path.dirname(out_path)
+    mkdir_or_exist(out_dir)
+
+    with jsonlines.open(out_path, mode='w') as writer:
+        writer.write_all(data_list[:num_img])
+
+    if args.label_map_file is not None:
+        out_dir = os.path.dirname(
+            os.path.join(args.out_dir, args.label_map_file))
+        mkdir_or_exist(out_dir)
+        shutil.copyfile(
+            os.path.join(args.data_root, args.label_map_file),
+            os.path.join(args.out_dir, args.label_map_file))
+
+
+if __name__ == '__main__':
+    main()