DML-specific change in model builder for Phi-3.5 #888

Merged: 5 commits, Sep 16, 2024
Changes from 2 commits
examples/python/awq-quantized-model.py (12 changes: 10 additions & 2 deletions)
@@ -30,6 +30,13 @@ def parse_args():
help="Folder to save AWQ-quantized ONNX model and associated files in",
)

parser.add_argument(
"-e",
"--execution_provider",
default="cuda",
help="target execution provider to apply quanization. e.g. dml, cuda",
zhangxiang1993 marked this conversation as resolved.
Show resolved Hide resolved
)

args = parser.parse_args()
return args

@@ -108,13 +115,14 @@ def main():
     input_folder = args.quant_path
     output_folder = args.output_path
     precision = "int4"
-    execution_provider = "cuda"
+    execution_provider = args.execution_provider
     cache_dir = os.path.join(".", "cache_dir")
 
     create_model(model_name, input_folder, output_folder, precision, execution_provider, cache_dir)
 
     # Run ONNX model
-    run_model(args)
+    if args.execution_provider != "dml":
+        run_model(args)
 
 if __name__ == "__main__":
     main()
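Note: with the new `--execution_provider` flag, the export step can target DML while model execution is skipped (the example's run path assumes a CUDA build). A minimal sketch of the equivalent direct call, assuming the `create_model` signature shown above; the model name and paths are placeholders:

```python
from onnxruntime_genai.models.builder import create_model

# Placeholder model name and paths, for illustration only.
create_model(
    "microsoft/Phi-3.5-mini-instruct",  # model_name
    "./awq_quantized",                  # input folder with AWQ-quantized weights
    "./onnx_out",                       # output folder
    "int4",                             # precision
    "dml",                              # execution provider targeted by this PR
    "./cache_dir",
)
```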
src/python/py/models/builder.py (76 changes: 60 additions & 16 deletions)
@@ -571,6 +571,11 @@ def make_greater(self, name, inputs, shape):
         output = f"{name}/output_0"
         self.make_node("Greater", inputs=inputs, outputs=[output], name=name)
         self.make_value_info(output, TensorProto.BOOL, shape=shape)
 
+    def make_greater_or_equal(self, name, inputs, shape):
+        output = f"{name}/output_0"
+        self.make_node("GreaterOrEqual", inputs=inputs, outputs=[output], name=name)
+        self.make_value_info(output, TensorProto.BOOL, shape=shape)
+
     def make_isinf(self, name, root_input, shape):
         output = f"{name}/output_0"
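These builder helpers each emit a single ONNX node plus value info for its output. Roughly what `make_greater_or_equal` produces, expressed with the stock `onnx.helper` API; the tensor names are illustrative:

```python
from onnx import TensorProto, helper

# One GreaterOrEqual node comparing the scalar max position id against the
# original-context-length constant; the output is a scalar BOOL.
node = helper.make_node(
    "GreaterOrEqual",
    inputs=[
        "/model/pos_ids_reformat/ReduceMax/output_0",
        "/model/constants/TensorProto.INT64/0D/4096",
    ],
    outputs=["/model/pos_ids_reformat/GreaterOrEqual/output_0"],
    name="/model/pos_ids_reformat/GreaterOrEqual",
)
value_info = helper.make_tensor_value_info(
    "/model/pos_ids_reformat/GreaterOrEqual/output_0", TensorProto.BOOL, []
)
```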
@@ -597,6 +602,11 @@ def make_reduce_sum(self, name, inputs, dtype, shape):
self.make_node("ReduceSum", inputs=inputs, outputs=[output], name=name)
self.make_value_info(output, dtype, shape=shape)

def make_reduce_max(self, name, inputs, dtype, shape):
output = f"{name}/output_0"
self.make_node("ReduceMax", inputs=inputs, outputs=[output], name=name, keepdims=False)
self.make_value_info(output, dtype, shape=shape)

def make_cast(self, name, root_input, dtype, shape):
output = f"{name}/output_0"
self.make_node("Cast", inputs=[root_input], outputs=[output], name=name, to=dtype)
@@ -924,7 +934,17 @@ def make_rotary_embedding_caches(self, rotemb, **kwargs):
         if self.rotemb_attrs["create_rotary_embedding_caches"]:
             if not hasattr(rotemb, "cos_cached"):
                 # Create cos/sin caches if not already created
-                cos_cache, sin_cache = self.make_rotary_embedding_caches_from_scratch()
+                if self.ep == "dml" and type(self).__name__ == "Phi3Mini128KModel":
+                    # Concatenate 4K and 128K cos/sin caches for Phi-3/Phi-3.5 on the DML EP only
+                    cos_cache_large, sin_cache_large = self.make_rotary_embedding_caches_from_scratch()
+                    self.rotemb_attrs["rescale_factors"] = self.rotemb_attrs["multi_cache"]["short_factor"]
+                    self.rotemb_attrs["cache_length"] = self.original_context_length
+                    self.rotemb_attrs["mscale"] = self.rotemb_attrs["multi_cache"]["short_mscale"]
+                    cos_cache_small, sin_cache_small = self.make_rotary_embedding_caches_from_scratch()
+                    cos_cache = torch.cat((cos_cache_small, cos_cache_large), dim=0)
+                    sin_cache = torch.cat((sin_cache_small, sin_cache_large), dim=0)
+                else:
+                    cos_cache, sin_cache = self.make_rotary_embedding_caches_from_scratch()
             else:
                 cos_cache, sin_cache = rotemb.cos_cached, rotemb.sin_cached

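With this change the DML cos/sin caches are stacked along dim 0: rows [0, original_context_length) hold the short-factor (4K) values, and the long-factor (128K) rows follow. A small sketch of the layout and how it is indexed, with made-up shapes (the row counts are assumptions for illustration):

```python
import torch

original_context_length = 4096  # illustrative; Phi-3's pre-extension length
head_dim = 96                   # illustrative

# Stand-ins for the caches built from the short and long rescale factors.
cos_small = torch.zeros(original_context_length, head_dim)       # 4K rows
cos_large = torch.ones(original_context_length * 32, head_dim)   # 128K rows

cos_cache = torch.cat((cos_small, cos_large), dim=0)

# Position p indexes the short-factor rows; p + original_context_length
# lands on the long-factor row for the same position.
p = 10
assert torch.equal(cos_cache[p], cos_small[p])
assert torch.equal(cos_cache[original_context_length + p], cos_large[p])
```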
@@ -2215,22 +2235,45 @@ def make_position_ids_reformatting(self):
         # position_ids input for RotaryEmbedding
 
         basename = "/model/pos_ids_reformat"
-        shape_name = f"{basename}/Shape"
-        self.make_shape(shape_name, root_input="input_ids" if not self.exclude_embeds else "inputs_embeds", shape=[2] if not self.exclude_embeds else [3])
-        gather_name = f"{basename}/Gather"
-        gather_inputs = [f"{shape_name}/output_0", "/model/constants/TensorProto.INT64/0D/1"]
-        self.make_gather(gather_name, gather_inputs, axis=0)
-        unsqueeze_name = f"{basename}/Unsqueeze"
-        unsqueeze_inputs = [f"{gather_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"]
-        self.make_unsqueeze(unsqueeze_name, unsqueeze_inputs, dtype=TensorProto.INT64, shape=[1])
-        concat_name = f"{basename}/Concat"
-        concat_inputs = ["/model/constants/TensorProto.INT64/1D/-1", f"{unsqueeze_name}/output_0"]
-        self.make_concat(concat_name, concat_inputs, dtype=TensorProto.INT64, shape=[2], axis=0)
-        reshape_name = f"{basename}/Reshape"
-        reshape_inputs = ["position_ids", f"{concat_name}/output_0"]
-        self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None)
-
-        return reshape_name
+        if self.ep == "dml":
+            reduce_max_name = f"{basename}/ReduceMax"
+            reduce_max_inputs = ["position_ids"]
+            self.make_reduce_max(reduce_max_name, reduce_max_inputs, dtype=TensorProto.INT64, shape=[1])
+
+            greater_or_equal_name = f"{basename}/GreaterOrEqual"
+            greater_or_equal_inputs = [f"{reduce_max_name}/output_0", f"/model/constants/TensorProto.INT64/0D/{self.original_context_length}"]
+            self.make_greater_or_equal(greater_or_equal_name, greater_or_equal_inputs, shape=[])
+
+            cast_name = f"{basename}/Cast"
+            self.make_cast(cast_name, f"{greater_or_equal_name}/output_0", dtype=TensorProto.INT64, shape=None)
+
+            mul_name = f"{basename}/Mul"
+            mul_inputs = [f"{cast_name}/output_0", f"/model/constants/TensorProto.INT64/0D/{self.original_context_length}"]
+            self.make_mul(mul_name, mul_inputs, dtype=TensorProto.INT64, shape=None)
+
+            add_1_name = f"{basename}/Add_1"
+            add_1_inputs = [f"{mul_name}/output_0", "position_ids"]
+            self.make_add(add_1_name, add_1_inputs, dtype=TensorProto.INT64, shape=["batch_size", "sequence_length"])
+
+            return add_1_name
+        else:
+            shape_name = f"{basename}/Shape"
+            self.make_shape(shape_name, root_input="input_ids" if not self.exclude_embeds else "inputs_embeds", shape=[2] if not self.exclude_embeds else [3])
+            gather_name = f"{basename}/Gather"
+            gather_inputs = [f"{shape_name}/output_0", "/model/constants/TensorProto.INT64/0D/1"]
+            self.make_gather(gather_name, gather_inputs, axis=0)
+            unsqueeze_name = f"{basename}/Unsqueeze"
+            unsqueeze_inputs = [f"{gather_name}/output_0", "/model/constants/TensorProto.INT64/1D/0"]
+            self.make_unsqueeze(unsqueeze_name, unsqueeze_inputs, dtype=TensorProto.INT64, shape=[1])
+            concat_name = f"{basename}/Concat"
+            concat_inputs = ["/model/constants/TensorProto.INT64/1D/-1", f"{unsqueeze_name}/output_0"]
+            self.make_concat(concat_name, concat_inputs, dtype=TensorProto.INT64, shape=[2], axis=0)
+            reshape_name = f"{basename}/Reshape"
+            reshape_inputs = ["position_ids", f"{concat_name}/output_0"]
+            self.make_reshape(reshape_name, reshape_inputs, dtype=TensorProto.INT64, shape=None)
+
+            return reshape_name
 
 
 class LlamaModel(Model):
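On the DML path this subgraph computes position_ids + (max(position_ids) >= original_context_length) * original_context_length: once generation crosses the original context length, every index is shifted into the long-factor half of the concatenated cos/sin cache. The same arithmetic as a quick sketch (values are illustrative):

```python
import torch

original_context_length = 4096  # illustrative

def reformat_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    # Mirrors the ReduceMax -> GreaterOrEqual -> Cast -> Mul -> Add chain above.
    use_long_cache = (position_ids.max() >= original_context_length).to(torch.int64)
    return position_ids + use_long_cache * original_context_length

print(reformat_position_ids(torch.tensor([[0, 1, 2]])))           # unchanged: short-factor rows
print(reformat_position_ids(torch.tensor([[4094, 4095, 4096]])))  # all shifted by 4096
```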
@@ -2371,6 +2414,7 @@ def make_mlp_proj(self, layer_id, mlp, root_input):
 class Phi3Mini128KModel(Phi3Mini4KModel):
     def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         super().__init__(config, io_dtype, onnx_dtype, ep, cache_dir, extra_options)
+        self.concat_cache = self.ep == "dml"
         self.make_rotary_embedding_multi_cache()

