ShareGPT4V inference code.

Tested on Ascend 910B with MindSpore 2.3.1.
mindspore-lab · Sep 26, 2024 · 31376c5 · 31376c5
1 parent 1af0b39
commit 31376c5
Show file tree

Hide file tree

Showing 30 changed files with 5,056 additions and 0 deletions.
diff --git a/examples/sharegpt_4v/readme.md b/examples/sharegpt_4v/readme.md
@@ -0,0 +1,13 @@
+# ShareGPT4V: Improving Large Multi-modal Models with Better Captions
+
+[Paper](https://arxiv.org/pdf/2311.12793.pdf)
+
+[Official Repo](https://github.com/ShareGPT4Omni/ShareGPT4V)
+
+![Image](https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/teaser.png)
+
+
+## Inference
+
+
+1. Prepare weight files:
diff --git a/examples/sharegpt_4v/share4v/__init__.py b/examples/sharegpt_4v/share4v/__init__.py
@@ -0,0 +1 @@
+from .model import Share4VLlamaForCausalLM
diff --git a/examples/sharegpt_4v/share4v/configs/config.json b/examples/sharegpt_4v/share4v/configs/config.json
@@ -0,0 +1,46 @@
+{
+  "_name_or_path": "MS_ShareGPT4V-7B",
+  "architectures": [
+    "Share4VLlamaForCausalLM"
+  ],
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "image_grid_pinpoints": null,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 4096,
+  "mm_hidden_size": 1024,
+  "mm_projector_lr": null,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -2,
+  "mm_vision_tower": "/root/congw/project/ms_ShareGPT4V/share4v/configs/vit/",
+  "mm_vision_tower_path":"/root/congw/project/ms_ShareGPT4V/share4v/configs/vit/vit-large336-l12.ckpt",
+  "model_type": "share4v",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "tie_word_embeddings": false,
+  "dtype": "float32",
+  "transformers_version": "4.31.0",
+  "tune_entire_model": false,
+  "tune_mm_mlp_adapter": false,
+  "tune_vision_tower": false,
+  "use_cache": true,
+  "use_mm_proj": true,
+  "vision_tower_lr": null,
+  "vocab_size": 32000,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "use_return_dict": true
+}
diff --git a/examples/sharegpt_4v/share4v/configs/tokenizer_config.json b/examples/sharegpt_4v/share4v/configs/tokenizer_config.json
@@ -0,0 +1,36 @@
+{
+    "add_bos_token": true,
+    "add_eos_token": false,
+    "bos_token": {
+      "__type": "AddedToken",
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    "clean_up_tokenization_spaces": false,
+    "eos_token": {
+      "__type": "AddedToken",
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    "legacy": false,
+    "model_max_length": 2048,
+    "pad_token": null,
+    "padding_side": "right",
+    "sp_model_kwargs": {},
+    "tokenizer_class": "LlamaTokenizer",
+    "unk_token": {
+      "__type": "AddedToken",
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  }
+
diff --git a/examples/sharegpt_4v/share4v/configs/vit/config.json b/examples/sharegpt_4v/share4v/configs/vit/config.json
@@ -0,0 +1,23 @@
+{
+  "_name_or_path": "ShareGPT4V-7B_Pretrained_vit-large336-l12",
+  "architectures": [
+    "CLIPVisionModel"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 336,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.31.0"
+}
diff --git a/examples/sharegpt_4v/share4v/configs/vit/preprocessor_config.json b/examples/sharegpt_4v/share4v/configs/vit/preprocessor_config.json
@@ -0,0 +1,29 @@
+{
+    "crop_size": {
+      "height": 336,
+      "width": 336
+    },
+    "do_center_crop": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "feature_extractor_type": "CLIPFeatureExtractor",
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_processor_type": "CLIPImageProcessor",
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+      "shortest_edge": 336
+    }
+  }
+
diff --git a/examples/sharegpt_4v/share4v/constants.py b/examples/sharegpt_4v/share4v/constants.py
@@ -0,0 +1,12 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30  # NOTE(review): unit not shown here, presumably seconds — confirm
+WORKER_HEART_BEAT_INTERVAL = 15  # NOTE(review): unit not shown here, presumably seconds — confirm
+
+LOGDIR = "."  # presumably a directory path; "." would then mean the current working directory — confirm usage
+
+# Model Constants
+IGNORE_INDEX = -100  # NOTE(review): looks like a label value to be skipped by the loss — confirm
+IMAGE_TOKEN_INDEX = -200  # NOTE(review): negative, so presumably a placeholder id rather than a vocabulary entry — confirm
+DEFAULT_IMAGE_TOKEN = "<image>"  # presumably the prompt marker that stands in for an image — confirm
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"