
Commit

Merge branch 'cg123:main' into main
Masterjp123 authored Jan 31, 2024
2 parents 9e13781 + 031a3c2 commit 159b860
Showing 6 changed files with 561 additions and 15 deletions.
38 changes: 38 additions & 0 deletions docs/moe.md
@@ -0,0 +1,38 @@
# mergekit-moe

`mergekit-moe` is a script for combining Mistral or Llama models of the same size into Mixtral Mixture of Experts models. The script combines the self-attention and layer normalization parameters from a "base" model with the MLP parameters from a set of "expert" models. `mergekit-moe` uses its own YAML configuration syntax, which looks like this:

```yml
base_model: path/to/self_attn_donor
gate_mode: hidden # one of "hidden", "cheap_embed", or "random"
dtype: bfloat16 # output dtype (float32, float16, or bfloat16)
## (optional)
# experts_per_token: 2
experts:
  - source_model: expert_model_1
    positive_prompts:
      - "This is a prompt that is demonstrative of what expert_model_1 excels at"
    ## (optional)
    # negative_prompts:
    #   - "This is a prompt expert_model_1 should not be used for"
  - source_model: expert_model_2
  # ... and so on
```

The script takes two arguments, an input config and an output path: `mergekit-moe ./config.yml ./my-clowncar-moe-12x180B`

## Gate Modes

Three methods for populating the MoE gates are implemented.

### "hidden"

Uses the hidden-state representations of the positive/negative prompts for the MoE gate parameters. This is the best-quality and most effective option, and the default. Because it requires evaluating each prompt with the base model, you might not be able to use it on constrained hardware (depending on the model). You can use `--load-in-8bit` or `--load-in-4bit` to reduce VRAM usage.
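
For intuition, here is a minimal sketch of the idea (not mergekit-moe's actual implementation; the model path and prompt are placeholders from the config example above): each expert's positive prompts are run through the shared base model, and the averaged per-layer hidden states become that expert's routing vector in each layer's gate.

```python
# Illustrative sketch only -- it shows where "hidden" gate vectors come from,
# not how mergekit-moe computes them internally.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "path/to/self_attn_donor"  # placeholder path from the YAML example
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, output_hidden_states=True)


def prompt_hidden_states(prompts):
    """Average hidden state per layer over all prompts and token positions."""
    acc = None
    for text in prompts:
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            hidden = model(**inputs).hidden_states  # embeddings + one entry per layer
        states = torch.stack([h.mean(dim=1).squeeze(0) for h in hidden[1:]])
        acc = states if acc is None else acc + states
    return acc / len(prompts)  # shape: [num_layers, hidden_size]


# Stacking one such vector per expert yields a [num_experts, hidden_size] gate
# weight for each layer.
expert_vec = prompt_hidden_states(["This is a prompt expert_model_1 excels at"])
```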

### "cheap_embed"

Uses only the raw token embeddings of the prompts, with the same gate parameters for every layer. Distinctly less effective than "hidden", but it can be run on much, much lower-end hardware.
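
As a rough sketch of the idea (again illustrative, with placeholder names, not mergekit-moe's code): only the input embedding table is consulted, and the resulting vector is reused as the expert's gate row at every layer, so no forward pass is needed.

```python
# Illustrative sketch only: "cheap_embed" gate vectors from raw token embeddings.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "path/to/self_attn_donor"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(base)
embed = AutoModelForCausalLM.from_pretrained(base).get_input_embeddings().weight


def prompt_embedding(prompts):
    """Average raw token embedding over all tokens of all prompts."""
    ids = torch.cat([tokenizer(p, return_tensors="pt").input_ids[0] for p in prompts])
    return embed[ids].mean(dim=0)  # shape: [hidden_size], reused at every layer
```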

### "random"

Randomly initializes the MoE gates. Useful if you are going to fine-tune the model afterwards, or maybe if you want something a little unhinged? I won't judge.
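
For completeness, random initialization amounts to something like the following (dimensions are illustrative only):

```python
import torch

# Illustrative only: one small random gate matrix per layer,
# shaped [num_experts, hidden_size] -- here 8 experts with hidden size 4096.
gate_weight = torch.randn(8, 4096) * 0.02
```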
60 changes: 46 additions & 14 deletions mergekit/architecture.py
@@ -14,7 +14,7 @@
# along with this program. If not, see http://www.gnu.org/licenses/.

from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import ClassVar, List, Optional

from pydantic import BaseModel
from transformers import PretrainedConfig
@@ -117,6 +117,40 @@ def all_weights(self, config: PretrainedConfig) -> List[str]:
)


class MixtralTensorNames(ArchitectureInfo, BaseModel):
    ARCHITECTURE_NAME: ClassVar[str] = "MixtralForCausalLM"
    num_local_experts: int

    @classmethod
    def from_config(cls, config: PretrainedConfig):
        return MixtralTensorNames(num_local_experts=config.num_local_experts)

    def pre_weights(self) -> List[str]:
        return MISTRAL_INFO.pre_weights()

    def post_weights(self) -> List[str]:
        return MISTRAL_INFO.post_weights()

    def embed_weights(self) -> List[str]:
        return MISTRAL_INFO.embed_weights()

    def num_layers_config_key(self) -> str:
        return MISTRAL_INFO.num_layers_config_key()

    def layer_weight_formats(self) -> List[str]:
        num_experts = self.num_local_experts
        res = [fmt for fmt in MISTRAL_INFO.layer_weight_formats() if ".mlp." not in fmt]
        for expert_idx in range(num_experts):
            for param in ("w1", "w2", "w3"):
                fmt = (
                    MISTRAL_INFO.layer_prefix_format
                    + f".block_sparse_moe.experts.{expert_idx}.{param}.weight"
                )
                res.append(fmt)
        res.append(MISTRAL_INFO.layer_prefix_format + ".block_sparse_moe.gate.weight")
        return res


STABLELM_INFO = StaticTensorNames(
    name="StableLMEpochForCausalLM",
    post_weight_names=LLAMA_INFO.post_weight_names + ["model.norm.bias"],
@@ -289,29 +323,25 @@ def all_weights(self, config: PretrainedConfig) -> List[str]:
)


-class PhiTensorNames(ArchitectureInfo):
-    architecture_name: str = "MixFormerSequentialForCausalLM"
-
-    def __init__(self, config: PretrainedConfig):
-        self.config = config
+class PhiTensorNames(ArchitectureInfo, BaseModel):
+    ARCHITECTURE_NAME: ClassVar[str] = "MixFormerSequentialForCausalLM"
+    n_layer: int

-    def __eq__(self, rhs: "PhiTensorNames"):
-        if not isinstance(rhs, PhiTensorNames):
-            return False
-        return self.num_layers() == rhs.num_layers()
+    def from_config(cls, config: PretrainedConfig):
+        return PhiTensorNames(n_layer=config.n_layer)

    def pre_weights(self) -> List[str]:
        return ["layers.0.wte.weight"]

    def post_weights(self) -> List[str]:
-        fake_layer_idx = self.config.n_layer + 1
+        fake_layer_idx = self.n_layer
        return [
            f"layers.{fake_layer_idx}.{suffix}"
            for suffix in ["linear.bias", "linear.weight", "ln.bias", "ln.weight"]
        ]

    def embed_weights(self) -> List[str]:
-        fake_layer_idx = self.config.n_layer + 1
+        fake_layer_idx = self.n_layer
        return [
            "layers.0.wte.weight",
            f"layers.{fake_layer_idx}.linear.weight",
@@ -423,8 +453,10 @@ def get_architecture_info(config: PretrainedConfig) -> StaticTensorNames:
        raise RuntimeError("More than one architecture in config?")

    arch_name = config.architectures[0]
-    if arch_name == PhiTensorNames.architecture_name:
-        return PhiTensorNames(config)
+    if arch_name == PhiTensorNames.ARCHITECTURE_NAME:
+        return PhiTensorNames.from_config(config)
+    if arch_name == MixtralTensorNames.ARCHITECTURE_NAME:
+        return MixtralTensorNames.from_config(config)

    if arch_name == PHI2_INFO.name:
        if config.model_type == "phi-msft":
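
For reference, a quick sketch of what the new `MixtralTensorNames` class produces (the eight-expert count is arbitrary, and the `model.layers.{idx}` prefix in the comment assumes Mistral's usual layer naming):

```python
from mergekit.architecture import MixtralTensorNames

info = MixtralTensorNames(num_local_experts=8)
for fmt in info.layer_weight_formats():
    print(fmt)
# Alongside the attention and norm weight formats inherited from MISTRAL_INFO,
# this yields per-layer entries such as:
#   model.layers.{idx}.block_sparse_moe.experts.0.w1.weight
#   ...
#   model.layers.{idx}.block_sparse_moe.experts.7.w3.weight
#   model.layers.{idx}.block_sparse_moe.gate.weight
```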
3 changes: 3 additions & 0 deletions mergekit/common.py
@@ -173,6 +173,9 @@ def __str__(self) -> str:


def dtype_from_name(name: Optional[str]) -> torch.dtype:
    if name.startswith("torch."):
        name = name[len("torch.") :]

    if name == "bfloat16":
        return torch.bfloat16
    elif name == "float16":
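
The effect of the new prefix handling in `dtype_from_name`, as a small usage sketch (assuming the function is importable from `mergekit.common`, as the file path suggests):

```python
import torch

from mergekit.common import dtype_from_name

# "torch.bfloat16" and plain "bfloat16" now resolve to the same dtype.
assert dtype_from_name("torch.bfloat16") is torch.bfloat16
assert dtype_from_name("bfloat16") is torch.bfloat16
```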
2 changes: 1 addition & 1 deletion mergekit/io/lazy_tensor_loader.py
@@ -98,7 +98,7 @@ def from_disk(cls, base_path: str) -> "ShardedTensorIndex":
                    tensor_paths = {key: shard_name for key in st.keys()}
            else:
                # this is ugly but not much else can be done
-                shard = torch.load(model_path)
+                shard = torch.load(model_path, map_location="meta")
                if "state_dict" in shard:
                    shard = shard["state_dict"]

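
For context, a small illustration (not mergekit code) of what `map_location="meta"` buys here, assuming a torch version that supports the meta device in `torch.load`, which this change relies on: the shard's keys, shapes, and dtypes become available without materializing the weights in memory.

```python
import torch

torch.save({"state_dict": {"w": torch.zeros(4, 4)}}, "tiny_shard.pt")

shard = torch.load("tiny_shard.pt", map_location="meta")
if "state_dict" in shard:
    shard = shard["state_dict"]

# Tensors land on the meta device: shapes and dtypes only, no real storage.
print(shard["w"].device, shard["w"].shape, shard["w"].dtype)
```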
