diff --git a/examples/stable_diffusion_v2/README.md b/examples/stable_diffusion_v2/README.md index e0b109d592..f93957a184 100644 --- a/examples/stable_diffusion_v2/README.md +++ b/examples/stable_diffusion_v2/README.md @@ -73,6 +73,7 @@ The compatible framework versions that are well-tested are listed as follows. | 910 | 2.0 | 6.3 RC1 | 23.0.rc1 | 3.7.16 | master (4c33849) | | 910 | 2.1 | 6.3 RC2 | 23.0.rc2 | 3.9.18 | master (4c33849) | | 910* | 2.2.1 (20231124) | 7.1 | 23.0.rc3.6 | 3.7.16 | master (4c33849) | +| 910* | 2.3.0 | 7.3 | 23.0.3 | 3.8.8 | master | @@ -281,7 +282,6 @@ To run vanilla fine-tuning, we will use the `train_text_to_image.py` script foll --output_path {path to output directory} \ --pretrained_model_path {path to pretrained checkpoint file} ``` - > Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. Take fine-tuning SD1.5 on the Pokemon dataset as an example: @@ -319,7 +319,6 @@ For parallel training on multiple Ascend NPUs, please refer to the instructions ```shell bash scripts/run_train_distributed.sh ``` - > Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. After launched, the training process can be traced by running `tail -f ouputs/train_txt2img/rank_0/train.log`. diff --git a/examples/stable_diffusion_v2/benchmark.md b/examples/stable_diffusion_v2/benchmark.md index 5184de8733..c5bdaadac3 100644 --- a/examples/stable_diffusion_v2/benchmark.md +++ b/examples/stable_diffusion_v2/benchmark.md @@ -4,41 +4,45 @@ ### Training -| SD Model | Context | Method | Global Batch Size x Grad. Accu. | Resolution | Acceleration | FPS (img/s) | -|---------------|---------------|--------------|:-------------------:|:------------------:|:----------------:|:----------------:| -| 1.5 | D910x1-MS2.1 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | 5.98 | -| 1.5 | D910x8-MS2.1 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | 31.18 | -| 1.5 | D910x1-MS2.1 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | 8.25 | -| 1.5 | D910x8-MS2.1 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | 63.85 | -| 1.5 | D910x1-MS2.1 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | 2.09 | -| 2.0 | D910x1-MS2.1 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | 6.19 | -| 2.0 | D910x8-MS2.1 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | 33.50 | -| 2.0 | D910x1-MS2.1 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | 9.46 | -| 2.0 | D910x8-MS2.1 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | 73.51 | -| 2.0 | D910x1-MS2.1 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | 2.18 | -| 2.1-v | D910x1-MS2.1 | Vanilla | 3x1 | 768x768 | Graph, DS, FP16, FA | 3.16 | -| 2.1-v | D910x8-MS2.1 | Vanilla | 24x1 | 768x768 | Graph, DS, FP16, FA | 18.98 | -| 2.1-v | D910x1-MS2.1 | LoRA | 4x1 | 768x768 | Graph, DS, FP16, FA | 3.39 | -| 2.1-v | D910x8-MS2.1 | LoRA | 32x1 | 768x768 | Graph, DS, FP16, FA | 23.45 | -| 1.5 | D910*x1-MS2.2.10 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | 9.22 | -| 1.5 | D910*x8-MS2.2.10 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | 52.30 | -| 1.5 | D910*x1-MS2.2.10 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | 13.58 | -| 1.5 | D910*x8-MS2.2.10 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | 105.08 | -| 1.5 | D910*x1-MS2.2.10 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | 2.92 | -| 2.0 | D910*x1-MS2.2.10 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | 10.03 | -| 2.0 | D910*x8-MS2.2.10 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | 55.69 | -| 2.0 | D910*x1-MS2.2.10 | LoRA | 4x1 | 512x512 | Graph, DS, 
FP16, | 15.88 |
-| 2.0 | D910*x8-MS2.2.10 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | 119.74 |
-| 2.0 | D910*x1-MS2.2.10 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | 2.93 |
-| 2.1-v | D910*x1-MS2.2.10 | Vanilla | 3x1 | 768x768 | Graph, DS, FP16, | 5.80 |
-| 2.1-v | D910*x1-MS2.2.10 | Vanilla | 24x1 | 768x768 | Graph, DS, FP16, | 46.02 |
-| 2.1-v | D910*x1-MS2.2.10 | LoRA | 4x1 | 768x768 | Graph, DS, FP16, | 6.65 |
-| 2.1-v | D910*x8-MS2.2.10 | LoRA | 32x1 | 768x768 | Graph, DS, FP16, | 52.57 |
+| SD Model | Context | Method | Global Batch Size x Grad. Accu. | Resolution | Acceleration | jit_level | FPS (img/s) |
+|---------------|---------------|--------------|:-------------------:|:------------------:|:----------------:|:----------------:|----------:|
+| 1.5 | D910x1-MS2.1 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | N/A | 5.98 |
+| 1.5 | D910x8-MS2.1 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | N/A | 31.18 |
+| 1.5 | D910x1-MS2.1 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | N/A | 8.25 |
+| 1.5 | D910x8-MS2.1 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | N/A | 63.85 |
+| 1.5 | D910x1-MS2.1 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | N/A | 2.09 |
+| 2.0 | D910x1-MS2.1 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | N/A | 6.19 |
+| 2.0 | D910x8-MS2.1 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | N/A | 33.50 |
+| 2.0 | D910x1-MS2.1 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | N/A | 9.46 |
+| 2.0 | D910x8-MS2.1 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | N/A | 73.51 |
+| 2.0 | D910x1-MS2.1 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | N/A | 2.18 |
+| 2.1-v | D910x1-MS2.1 | Vanilla | 3x1 | 768x768 | Graph, DS, FP16, FA | N/A | 3.16 |
+| 2.1-v | D910x8-MS2.1 | Vanilla | 24x1 | 768x768 | Graph, DS, FP16, FA | N/A | 18.98 |
+| 2.1-v | D910x1-MS2.1 | LoRA | 4x1 | 768x768 | Graph, DS, FP16, FA | N/A | 3.39 |
+| 2.1-v | D910x8-MS2.1 | LoRA | 32x1 | 768x768 | Graph, DS, FP16, FA | N/A | 23.45 |
+| 1.5 | D910*x1-MS2.3.0 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | O2 | 11.86 |
+| 1.5 | D910*x8-MS2.3.0 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | O2 | 75.53 |
+| 1.5 | D910*x1-MS2.3.0 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | O2 | 15.27 |
+| 1.5 | D910*x8-MS2.3.0 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | O2 | 119.94 |
+| 1.5 | D910*x1-MS2.3.0 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | O2 | 3.86 |
+| 2.0 | D910*x1-MS2.3.0 | Vanilla | 3x1 | 512x512 | Graph, DS, FP16, | O2 | 12.75 |
+| 2.0 | D910*x8-MS2.3.0 | Vanilla | 24x1 | 512x512 | Graph, DS, FP16, | O2 | 79.67 |
+| 2.0 | D910*x1-MS2.3.0 | LoRA | 4x1 | 512x512 | Graph, DS, FP16, | O2 | 16.53 |
+| 2.0 | D910*x8-MS2.3.0 | LoRA | 32x1 | 512x512 | Graph, DS, FP16, | O2 | 129.70 |
+| 2.0 | D910*x1-MS2.3.0 | Dreambooth | 1x1 | 512x512 | Graph, DS, FP16, | O2 | 3.76 |
+| 2.1-v | D910*x1-MS2.3.0 | Vanilla | 3x1 | 768x768 | Graph, DS, FP16, FA | O2 | 7.16 |
+| 2.1-v | D910*x8-MS2.3.0 | Vanilla | 24x1 | 768x768 | Graph, DS, FP16, FA | O2 | 49.27 |
+| 2.1-v | D910*x1-MS2.3.0 | LoRA | 4x1 | 768x768 | Graph, DS, FP16, FA | O2 | 9.51 |
+| 2.1-v | D910*x8-MS2.3.0 | LoRA | 32x1 | 768x768 | Graph, DS, FP16, FA | O2 | 71.51 |
 > Context: {Ascend chip}-{number of NPUs}-{mindspore version}.
 >
 > Acceleration: DS: data sink mode, FP16: float16 computation. FA: flash attention.
 >
 >FPS: images per second during training. average training time (s/step) = batch_size / FPS
+>
+> jit_level: controls the compilation optimization level. N/A means the MindSpore version in use does not support setting jit_level.
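
As a quick reference for what the `jit_level` column means in practice, the sketch below shows how a level is applied through MindSpore's context API — the same call this PR adds in `common.py`. It is a minimal illustration only, not part of the patch:

```python
import mindspore as ms

# jit_level takes effect in graph mode and requires MindSpore >= 2.3.0;
# "O0"/"O1" run KernelByKernel, "O2" runs in Sink execution mode.
ms.set_context(mode=ms.GRAPH_MODE)
ms.set_context(jit_config={"jit_level": "O2"})
```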
+
+Note that jit_level takes effect only on MindSpore 2.3.0 and later.
 
 Note that the performance of SD2.1 should be similar to SD2.0 since they have the same network architecture.
 
@@ -57,20 +61,25 @@ Flash Attention,
 
 ### Inference
 
-| SD Model | Context | Scheduler | Steps | Resolution | Batch Size | Speed (step/s) | FPS (img/s) |
-|---------------|:-----------|:------------:|:------------------:|:----------------:|:----------------:|:----------------:|:----------------:|
-| 1.5 | D910x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | 3.58 | 0.44 |
-| 2.0 | D910x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | 4.12 | 0.49 |
-| 2.1-v | D910x1-MS2.2.10 | DDIM | 30 | 768x768 | 4 | 1.14 | 0.14 |
-| 1.5 | D910*x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | 6.19 | 0.71 |
-| 2.0 | D910*x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | 7.65 | 0.83 |
-| 2.1-v | D910*x1-MS2.2.10 | DDIM | 30 | 768x768 | 4 | 2.79 | 0.32 |
+| SD Model | Context | Scheduler | Steps | Resolution | Batch Size | jit_level | Speed (step/s) | FPS (img/s) |
+|---------------|------------|--------------|:-------------------:|:-------------:|:----------------:|:----------------:|----------:|----------|
+| 1.5 | D910x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | N/A | 3.58 | 0.44 |
+| 2.0 | D910x1-MS2.2.10 | DDIM | 30 | 512x512 | 4 | N/A | 4.12 | 0.49 |
+| 2.1-v | D910x1-MS2.2.10 | DDIM | 30 | 768x768 | 4 | N/A | 1.14 | 0.14 |
+| 1.5 | D910*x1-MS2.3.0 | DDIM | 30 | 512x512 | 4 | O2 | 6.69 | 0.77 |
+| 2.0 | D910*x1-MS2.3.0 | DDIM | 30 | 512x512 | 4 | O2 | 8.30 | 0.91 |
+| 2.1-v | D910*x1-MS2.3.0 | DDIM | 30 | 768x768 | 4 | O2 | 2.91 | 0.36 |
+
+
 > Context: {Ascend chip}-{number of NPUs}-{mindspore version}.
 >
 > Speed (step/s): sampling speed measured in the number of sampling steps per second.
 >
 > FPS (img/s): image generation throughput measured in the number of image generated per second.
+>
+> jit_level: controls the compilation optimization level. N/A means the MindSpore version in use does not support setting jit_level.
+
+Note that jit_level takes effect only on MindSpore 2.3.0 and later.
 
 Note that the performance of SD2.1 should be similar to SD2.0 since they have the same network architecture.
 
 Performance per NPU in multi-NPU parallel mode is the same as performance of single NPU mode.
diff --git a/examples/stable_diffusion_v2/common.py b/examples/stable_diffusion_v2/common.py
index 1b704c00e3..ddc6538078 100644
--- a/examples/stable_diffusion_v2/common.py
+++ b/examples/stable_diffusion_v2/common.py
@@ -18,9 +18,11 @@ def init_env(
     seed: int = 42,
     distributed: bool = False,
     device_target: Optional[str] = "Ascend",
+    jit_level: str = "O2",
     enable_modelarts: bool = False,
     num_workers: int = 1,
     json_data_path: Optional[str] = None,
+    max_device_memory: Optional[str] = "1024GB",
 ) -> Tuple[int, int, int]:
     """
     Initialize MindSpore environment.
@@ -40,6 +42,20 @@
         A tuple containing the device ID, rank ID and number of devices.
     """
     set_random_seed(seed)
+    if mode == ms.GRAPH_MODE:
+        try:
+            if jit_level in ["O0", "O1", "O2"]:
+                ms.set_context(jit_config={"jit_level": jit_level})
+                _logger.info(f"set jit_level: {jit_level}.")
+            else:
+                _logger.warning(
+                    f"Unsupported jit_level: {jit_level}. The framework will select the execution method automatically."
+                )
+        except Exception:
+            _logger.warning(
+                "The jit_level setting is ignored because the current MindSpore version does not support it. "
+                "Please ensure MindSpore >= 2.3.0."
+            )
     if debug and mode == ms.GRAPH_MODE:  # force PyNative mode when debugging
         _logger.warning("Debug mode is on, switching execution mode to PyNative.")
@@ -52,6 +68,7 @@
             device_target=device_target,
             device_id=device_id,
             ascend_config={"precision_mode": "allow_fp32_to_fp16"},  # Only effective on Ascend 910*
+            max_device_memory=max_device_memory,
         )
         init()
         device_num = get_group_size()
@@ -80,6 +97,7 @@
             device_id=device_id,
             ascend_config={"precision_mode": "allow_fp32_to_fp16"},  # Only effective on Ascend 910*
             pynative_synchronize=debug,
+            max_device_memory=max_device_memory,
         )
     return device_id, rank_id, device_num
diff --git a/examples/stable_diffusion_v2/configs/train/train_config_vanilla_v2_vpred.yaml b/examples/stable_diffusion_v2/configs/train/train_config_vanilla_v2_vpred.yaml
index 4208816d7b..6812ae5c8d 100644
--- a/examples/stable_diffusion_v2/configs/train/train_config_vanilla_v2_vpred.yaml
+++ b/examples/stable_diffusion_v2/configs/train/train_config_vanilla_v2_vpred.yaml
@@ -17,6 +17,7 @@ ckpt_save_interval: 1
 epochs: 20
 use_ema: True
 clip_grad: False
+enable_flash_attention: True
 
 # lr scheduler
 scheduler: "cosine_decay"
diff --git a/examples/stable_diffusion_v2/configs/v2-vpred-inference.yaml b/examples/stable_diffusion_v2/configs/v2-vpred-inference.yaml
index f24750e139..c6b00dca0a 100644
--- a/examples/stable_diffusion_v2/configs/v2-vpred-inference.yaml
+++ b/examples/stable_diffusion_v2/configs/v2-vpred-inference.yaml
@@ -32,6 +32,7 @@ model:
         channel_mult: [ 1, 2, 4, 4 ]
         num_head_channels: 64
         use_spatial_transformer: True
+        enable_flash_attention: True
         use_linear_in_transformer: True
         transformer_depth: 1
         context_dim: 1024
diff --git a/examples/stable_diffusion_v2/depth_to_image.py b/examples/stable_diffusion_v2/depth_to_image.py
index 9f9286eebc..89f20173af 100644
--- a/examples/stable_diffusion_v2/depth_to_image.py
+++ b/examples/stable_diffusion_v2/depth_to_image.py
@@ -258,7 +258,6 @@ def main(args):
         rank=0,
         log_level=eval(args.log_level),
     )
-    # init
     device_id = int(os.getenv("DEVICE_ID", 0))
     ms.context.set_context(
@@ -268,6 +267,20 @@ def main(args):
         device_id=device_id,
         max_device_memory="30GB",
     )
+    if args.ms_mode == ms.GRAPH_MODE:
+        try:
+            if args.jit_level in ["O0", "O1", "O2"]:
+                ms.set_context(jit_config={"jit_level": args.jit_level})
+                logger.info(f"set jit_level: {args.jit_level}.")
+            else:
+                logger.warning(
+                    f"Unsupported jit_level: {args.jit_level}. The framework will select the execution method automatically."
+                )
+        except Exception:
+            logger.warning(
+                "The jit_level setting is ignored because the current MindSpore version does not support it. "
+                "Please ensure MindSpore >= 2.3.0."
+            )
 
     if args.save_graph:
         save_graphs_path = "graph"
@@ -416,6 +429,16 @@ def load_model_from_config(config, ckpt, verbose=False):
     parser.add_argument(
         "--ms_mode", type=int, default=0, help="Running in GRAPH_MODE(0) or PYNATIVE_MODE(1) (default=0)"
     )
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--num_samples", type=int, default=4, help="num of total samples")
     parser.add_argument(
         "--img_size",
diff --git a/examples/stable_diffusion_v2/docs/en/controlnet.md b/examples/stable_diffusion_v2/docs/en/controlnet.md
index 81b340d37f..9f7a46be5a 100644
--- a/examples/stable_diffusion_v2/docs/en/controlnet.md
+++ b/examples/stable_diffusion_v2/docs/en/controlnet.md
@@ -184,7 +184,7 @@ Try to customize your own ControlNet? Downloading the SD base weight, preparing
 
 Now only support training based on SD1.5.ControlNet
 
-All codes have been tested on Ascend 910* with MindSpore 2.2 20231124 version.
+All codes have been tested on Ascend 910* with MindSpore 2.3.0 release.
 
 ### Train a ControlNet from SD1.5
@@ -297,17 +297,25 @@ Final, execute the script to launch finetuning
 ```
 sh scripts/run_train_cldm.sh $CARD_ID
 ```
-> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found.
 
 The resulting log will be saved in $output_dir as defined in the script, and the saved checkpoint will be saved in $output_path as defined in `train_config` file.
 
 Here are the training performances:
 
-| Platform | Dataset | Task | Batch Size | Training Performance |
-| -------- | ------- | ---- | --------- | ------------------- |
-| 910A | Fill50k | Canny2Image | 4 | 620 ms/step|
-| 910* | Fill50k | Canny2Image | 4 | 552 ms/step|
-| 910A | MPII1K | Pose2Image | 2 | 490 ms/step|
+| Context | Dataset | Task | Batch Size | jit_level | Acceleration | Training Performance |
+| -------- | ------- | ---- | ---------- | --------- | ------------ | -------------------- |
+| D910x1-MS2.0.0 | Fill50k | Canny2Image | 4 | N/A | Graph, FP16 | 620 ms/step |
+| D910x1-MS2.0.0 | MPII1K | Pose2Image | 2 | N/A | Graph, FP16 | 490 ms/step |
+| D910*x1-MS2.3.0 | Fill50k | Canny2Image | 4 | O2 | Graph, FP16 | 377 ms/step |
+| D910*x1-MS2.3.0 | MPII1K | Pose2Image | 2 | O2 | Graph, FP16 | 253 ms/step |
+
+> Context: {Ascend chip}-{number of NPUs}-{mindspore version}.
+>
+> Acceleration: Graph: Graph Mode. FP16: float16 computation.
+>
+> jit_level: controls the compilation optimization level. N/A means the MindSpore version in use does not support setting jit_level.
+
+Note that jit_level takes effect only on MindSpore 2.3.0 and later.
 
 #### 4. Evaluation
diff --git a/examples/stable_diffusion_v2/docs/en/dreambooth_finetune.md b/examples/stable_diffusion_v2/docs/en/dreambooth_finetune.md
index 28378b7d21..5943ea0893 100644
--- a/examples/stable_diffusion_v2/docs/en/dreambooth_finetune.md
+++ b/examples/stable_diffusion_v2/docs/en/dreambooth_finetune.md
@@ -101,7 +101,6 @@ python train_dreambooth.py \
     --output_path "output/dreambooth_dog/txt2img" \
     --pretrained_model_path "models/sd_v2_base-57526ee4.ckpt"
 ```
-> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found.
 
 To modify other important hyper-parameters, please refer to training config file `train_config_dreambooth_v2.yaml`.
@@ -129,7 +128,6 @@ python train_dreambooth.py \
     --output_path "output/dreambooth_vanilla_dog/txt2img" \
     --pretrained_model_path "models/sd_v2_base-57526ee4.ckpt"
 ```
-> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found.
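
Since `train_dreambooth.py` also gains a `--jit_level` option in this PR, a DreamBooth run can pin the compilation level explicitly. A hypothetical invocation reusing only the arguments documented above (illustrative, not part of the patch):

```shell
# illustrative sketch: documented DreamBooth flags plus the new --jit_level option
python train_dreambooth.py \
    --output_path "output/dreambooth_dog/txt2img" \
    --pretrained_model_path "models/sd_v2_base-57526ee4.ckpt" \
    --jit_level O2    # one of O0 / O1 / O2; requires graph mode and MindSpore >= 2.3.0
```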
#### 2.2.5 Training Command for DreamBooth with LoRA @@ -147,8 +145,7 @@ python train_dreambooth.py \ --output_path "output/dreambooth_lora_dog/txt2img" \ --pretrained_model_path "models/sd_v2_base-57526ee4.ckpt" ``` -> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. -> + Note that we train the LoRA parameters with a constant learning rate `5e-5`, a weight decay `1e-4 ` for 4 epochs (800 steps). The rank of the LoRA parameter is 64. diff --git a/examples/stable_diffusion_v2/docs/en/lora_finetune.md b/examples/stable_diffusion_v2/docs/en/lora_finetune.md index b39bd9b30f..b2e3aa9b87 100644 --- a/examples/stable_diffusion_v2/docs/en/lora_finetune.md +++ b/examples/stable_diffusion_v2/docs/en/lora_finetune.md @@ -54,7 +54,6 @@ python train_text_to_image.py \ --output_path {path to output directory} \ --pretrained_model_path {path to pretrained checkpoint file} ``` -> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. The training configurations are specified via the `train_config` argument, including model architecture and the training hyper-parameters such as `lora_rank`. @@ -81,7 +80,6 @@ python train_text_to_image.py \ --output_path output/lora_pokemon \ --pretrained_model_path models/sd_v1.5-d0ab7146.ckpt ``` -> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. The trained LoRA checkpoints will be saved in `output/lora_pokemon/ckpt`. @@ -98,7 +96,6 @@ python train_text_to_image.py \ --output_path output/lora_chinese_art \ --pretrained_model_path models/sd_v2-1_base-7c8d09ce.ckpt ``` -> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. The trained LoRA checkpoints will be saved in `output/lora_chinese_art/ckpt`. @@ -115,7 +112,6 @@ python text_to_image.py \ --use_lora True \ --lora_ckpt_path {path/to/lora_checkpoint_after_finetune} ``` -> Please enable INFNAN mode by `export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE"` for Ascend 910* if overflow found. Please update `lora_ckpt_path` according to your fine-tuning settings. diff --git a/examples/stable_diffusion_v2/inference/README.md b/examples/stable_diffusion_v2/inference/README.md index a7f4043cda..37889a9658 100644 --- a/examples/stable_diffusion_v2/inference/README.md +++ b/examples/stable_diffusion_v2/inference/README.md @@ -149,6 +149,6 @@ You can get images at "output/samples". | sd-2.0-base_fa | text2img | 1 | 512*512 | 50 | Ascend 910 | MindSpore | 5.49 s | | sd-2.0-base-fa | text2img | 1 | 512*512 | 50 | Ascend 910 | Lite | 3.21 s | | sd-2.0-base-fa | text2img | 1 | 512*512 | 50 | Ascend 910* | Lite | 2.7 s | -| sd-1.5 | text2img | 1 | 512*512 | 50 | Ascend 910* | MindSpore | 4.57 s | +| sd-1.5 | text2img | 1 | 512*512 | 50 | Ascend 910*-MS2.3.0 | MindSpore | 2.87 s | The sampler schedule is DDIM. 
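
For the MindSpore rows in the table above, the corresponding entry point is `inference/sd_infer.py`, which gains a `--jit_level` flag in this PR (see the next diff). A hypothetical minimal invocation — only flags actually visible in this diff are shown, and model/prompt options are omitted:

```shell
# illustrative sketch, not a complete command: add your model and sampling options
python sd_infer.py --ms_mode 0 --device_target Ascend --jit_level O2
```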
diff --git a/examples/stable_diffusion_v2/inference/libs/helper.py b/examples/stable_diffusion_v2/inference/libs/helper.py
index 5def7d1a42..c8cf054f34 100644
--- a/examples/stable_diffusion_v2/inference/libs/helper.py
+++ b/examples/stable_diffusion_v2/inference/libs/helper.py
@@ -91,6 +91,20 @@ def set_env(args):
     # set ms context
     device_id = int(os.getenv("DEVICE_ID", 0))
     ms.context.set_context(mode=args.ms_mode, device_id=device_id)
+    if args.ms_mode == ms.GRAPH_MODE:
+        try:
+            if args.jit_level in ["O0", "O1", "O2"]:
+                ms.set_context(jit_config={"jit_level": args.jit_level})
+                logger.info(f"set jit_level: {args.jit_level}.")
+            else:
+                logger.warning(
+                    f"Unsupported jit_level: {args.jit_level}. The framework will select the execution method automatically."
+                )
+        except Exception:
+            logger.warning(
+                "The jit_level setting is ignored because the current MindSpore version does not support it. "
+                "Please ensure MindSpore >= 2.3.0."
+            )
 
     set_random_seed(args.seed)
diff --git a/examples/stable_diffusion_v2/inference/sd_infer.py b/examples/stable_diffusion_v2/inference/sd_infer.py
index 115e1c7c6a..2d02d1eb17 100644
--- a/examples/stable_diffusion_v2/inference/sd_infer.py
+++ b/examples/stable_diffusion_v2/inference/sd_infer.py
@@ -230,6 +230,16 @@ def main(args):
     parser.add_argument(
         "--ms_mode", type=int, default=0, help="Running in GRAPH_MODE(0) or PYNATIVE_MODE(1) (default=0)"
     )
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument(
         "--device_target", type=str, default="Ascend", help="Device target, should be in [Ascend, GPU, CPU]"
     )
diff --git a/examples/stable_diffusion_v2/inpaint.py b/examples/stable_diffusion_v2/inpaint.py
index 7fc99c3a39..ff83f4e5e4 100644
--- a/examples/stable_diffusion_v2/inpaint.py
+++ b/examples/stable_diffusion_v2/inpaint.py
@@ -18,13 +18,13 @@
 workspace = os.path.dirname(os.path.abspath(__file__))
 print("workspace:", workspace, flush=True)
 sys.path.append(workspace)
+from common import init_env
 from ldm.models.diffusion.plms import PLMSSampler
 
 # from ldm.models.diffusion.ddim import DDIMSampler
 # from ldm.models.diffusion.dpm_solver import DPMSolverSampler
 # from ldm.models.diffusion.uni_pc import UniPCSampler
 from ldm.modules.logger import set_logger
-from ldm.modules.train.tools import set_random_seed
 from ldm.util import instantiate_from_config
 
 logger = logging.getLogger("inpaint")
@@ -125,6 +125,14 @@ def image_grid(imgs, rows, cols):
 
 
 def main(args):
+    # init
+    device_id, _, _ = init_env(
+        args.ms_mode,
+        seed=args.seed,
+        jit_level=args.jit_level,
+        max_device_memory="30GB",
+    )
+
     # set logger
     set_logger(
         name="",
@@ -133,23 +141,11 @@ def main(args):
         log_level=eval(args.log_level),
     )
 
-    # init
-    device_id = int(os.getenv("DEVICE_ID", 0))
-    ms.context.set_context(
-        mode=args.ms_mode,
-        # mode=ms.context.GRAPH_MODE,
-        device_target="Ascend",
-        device_id=device_id,
-        max_device_memory="30GB",
-    )
-
     if args.save_graph:
         save_graphs_path = "graph"
         shutil.rmtree(save_graphs_path)
         ms.context.set_context(save_graphs=True, save_graphs_path=save_graphs_path)
 
-    set_random_seed(args.seed)
-
     if not os.path.isabs(args.config):
         args.config = os.path.join(workspace, args.config)
     config = OmegaConf.load(f"{args.config}")
@@ -282,6 +278,16 @@ def load_model_from_config(config, ckpt, verbose=False):
     parser.add_argument(
         "--ms_mode", type=int, default=0, help="Running in GRAPH_MODE(0) or PYNATIVE_MODE(1) (default=0)"
     )
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+ "O2: Ultimate performance optimization, adopt Sink execution mode.", + ) parser.add_argument("--num_samples", type=int, default=4, help="num of total samples") parser.add_argument("--img_size", type=int, default=512, help="") parser.add_argument("--batch_size", type=int, default=4, help="batch size of model") diff --git a/examples/stable_diffusion_v2/ldm/modules/attention.py b/examples/stable_diffusion_v2/ldm/modules/attention.py index 3ebe2fac8b..84ac9c4f07 100644 --- a/examples/stable_diffusion_v2/ldm/modules/attention.py +++ b/examples/stable_diffusion_v2/ldm/modules/attention.py @@ -17,21 +17,13 @@ import numpy as np from ldm.util import is_old_ms_version -from packaging import version import mindspore as ms import mindspore.numpy as msnp from mindspore import nn, ops from mindspore.common.initializer import initializer -try: - from mindspore.nn.layer.flash_attention import FlashAttention - - FLASH_IS_AVAILABLE = True - print("flash attention is available.") -except ImportError: - FLASH_IS_AVAILABLE = False - print("flash attention is unavailable.") +from mindone.models.modules.flash_attention import FLASH_IS_AVAILABLE, MSFlashAttention logger = logging.getLogger() @@ -152,14 +144,12 @@ def __init__( enable_flash_attention and FLASH_IS_AVAILABLE and (ms.context.get_context("device_target") == "Ascend") ) if self.enable_flash_attention: - # TODO: how high_precision affect the training or inference quality - if version.parse(ms.__version__) <= version.parse("2.2.0"): - self.flash_attention = FlashAttention(head_dim=dim_head, high_precision=True) - self.fa_mask_dtype = ms.float16 # choose_flash_attention_dtype() - else: - self.flash_attention = FlashAttention(head_dim=dim_head, head_num=heads, high_precision=True) - self.fa_mask_dtype = ms.uint8 # choose_flash_attention_dtype() - # logger.info("Flash attention is enabled.") + self.flash_attention = MSFlashAttention( + head_dim=dim_head, + head_num=heads, + input_layout="BNSD", + dtype=dtype, + ) else: self.flash_attention = None @@ -207,8 +197,7 @@ def construct(self, x, context=None, mask=None): q = q.view(q_b, q_n, h, -1).transpose(0, 2, 1, 3) k = k.view(k_b, k_n, h, -1).transpose(0, 2, 1, 3) v = v.view(v_b, v_n, h, -1).transpose(0, 2, 1, 3) - if mask is None: - mask = ops.zeros((q_b, q_n, q_n), self.fa_mask_dtype) + # FIXME: a trick to pad sdv1.5 head dimensions from 160 to 256 if head_dim == 160: # pad to 2**n * 64 @@ -217,9 +206,7 @@ def construct(self, x, context=None, mask=None): k = msnp.pad(k, ((0, 0), (0, 0), (0, 0), (0, padding_size)), constant_value=0) v = msnp.pad(v, ((0, 0), (0, 0), (0, 0), (0, padding_size)), constant_value=0) - out = self.flash_attention( - q.to(ms.float16), k.to(ms.float16), v.to(ms.float16), mask.to(self.fa_mask_dtype) - ) + out = self.flash_attention(q, k, v, mask=mask) if head_dim == 160: out = ops.slice(out, [0, 0, 0, 0], [q_b, h, q_n, head_dim]) b, h, n, d = out.shape @@ -298,12 +285,8 @@ def rearange_frame_back(x): q = q.view(q_b, q_n, h, -1).transpose(0, 2, 1, 3) k = k.view(k_b, k_n, h, -1).transpose(0, 2, 1, 3) v = v.view(v_b, v_n, h, -1).transpose(0, 2, 1, 3) - if mask is None: - mask = ops.zeros((q_b, q_n, q_n), self.fa_mask_dtype) - out = self.flash_attention( - q.to(ms.float16), k.to(ms.float16), v.to(ms.float16), mask.to(self.fa_mask_dtype) - ) + out = self.flash_attention(q, k, v, mask=mask) b, h, n, d = out.shape # reshape FA output to original attn input format, (b h n d) -> (b n h*d) diff --git a/examples/stable_diffusion_v2/ldm/modules/train/ema.py 
b/examples/stable_diffusion_v2/ldm/modules/train/ema.py index 45d0d14fe1..f9000ae12a 100644 --- a/examples/stable_diffusion_v2/ldm/modules/train/ema.py +++ b/examples/stable_diffusion_v2/ldm/modules/train/ema.py @@ -33,9 +33,8 @@ def __init__(self, network, ema_decay=0.9999, updates=0, trainable_only=True, of self.hyper_map = C.HyperMap() self.map = ops.HyperMap() - if offloading: - self.assign = ops.Assign().add_prim_attr("primitive_target", "CPU") - else: + self.offloading = offloading + if not offloading: self.assign = ops.Assign() def ema_update(self): @@ -47,8 +46,16 @@ def ema_update(self): self.updates = F.depend(self.updates, success) return self.updates + def swap_data(self, ori_datas, tgt_datas): + for ori_data, tgt_data in zip(ori_datas, tgt_datas): + tgt_data.set_data(ori_data) + # @ms_function def swap_before_eval(self): + if self.offloading: + self.swap_data(self.net_weight, self.swap_cache) + self.swap_data(self.ema_weight, self.net_weight) + return True # net -> swap success = self.map(self.assign, self.swap_cache, self.net_weight) # ema -> net @@ -57,6 +64,9 @@ def swap_before_eval(self): # @ms_function def swap_after_eval(self): + if self.offloading: + self.swap_data(self.swap_cache, self.net_weight) + return True # swap -> net success = self.map(self.assign, self.net_weight, self.swap_cache) return success diff --git a/examples/stable_diffusion_v2/scripts/run_infer_lora.sh b/examples/stable_diffusion_v2/scripts/run_infer_lora.sh index a18907cfba..0e27f88df9 100644 --- a/examples/stable_diffusion_v2/scripts/run_infer_lora.sh +++ b/examples/stable_diffusion_v2/scripts/run_infer_lora.sh @@ -1,5 +1,4 @@ export DEVICE_ID=$1 -export MS_ENABLE_REF_MODE=1 python text_to_image.py \ --prompt "a painting of a tree with a mountain in the background and a person standing in the foreground with a snow covered ground" \ diff --git a/examples/stable_diffusion_v2/scripts/run_train_cldm.sh b/examples/stable_diffusion_v2/scripts/run_train_cldm.sh index 2b59b9887f..cacb3f3e7f 100644 --- a/examples/stable_diffusion_v2/scripts/run_train_cldm.sh +++ b/examples/stable_diffusion_v2/scripts/run_train_cldm.sh @@ -1,6 +1,4 @@ export DEVICE_ID=$1 -export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" # debug -# export MS_ASCEND_CHECK_OVERFLOW_MODE=1 # debug task_name=train_cldm_canny_fill1k_e14_constant_ema output_dir=outputs/$task_name diff --git a/examples/stable_diffusion_v2/scripts/run_train_distributed.sh b/examples/stable_diffusion_v2/scripts/run_train_distributed.sh index c017ee44f6..cb150351c2 100644 --- a/examples/stable_diffusion_v2/scripts/run_train_distributed.sh +++ b/examples/stable_diffusion_v2/scripts/run_train_distributed.sh @@ -1,43 +1,15 @@ -# SD Parallel training via HCCL - -# Parallel config -num_devices=8 -# Please generate the rank table file via hccl_tools.py -# (https://gitee.com/mindspore/models/blob/master/utils/hccl_tools/hccl_tools.py) for your own server -rank_table_file=hccl_4p_01234567_127.0.0.1.json -CANDIDATE_DEVICE=(0 1 2 3 4 5 6 7) - -export DEVICE_NUM=$num_devices -export RANK_SIZE=$num_devices -export RANK_TABLE_FILE=$rank_table_file -echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}" - -# Training path config -data_path=datasets/pokemon_blip/train -pretrained_model_path=models/sd_v2_base-57526ee4.ckpt +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 output_path=outputs task_name=train_xt2img +data_path=datasets/pokemon_blip/train -# parallel train -rm -rf ${output_path:?}/${task_name:?} -mkdir -p ${output_path:?}/${task_name:?} -cp $0 $output_path/. 
-#export MS_COMPILER_CACHE_PATH=${output_path:?}/${task_name:?}
-
-export SERVER_ID=0
-rank_start=$((DEVICE_NUM * SERVER_ID))
-for((i=0; i<${RANK_SIZE}; i++))
-do
-    export RANK_ID=$((rank_start + i))
-    export DEVICE_ID=${CANDIDATE_DEVICE[i]}
-    mkdir -p ${output_path:?}/${task_name:?}/rank_$i
-    echo "start training for rank $RANK_ID, device $DEVICE_ID"
-    nohup python -u train_text_to_image.py \
+msrun --bind_core=True --master_port=8200 --worker_num=8 --local_worker_num=8 --log_dir=$output_path/$task_name \
+    python train_text_to_image.py \
         --train_config="configs/train/train_config_vanilla_v1.yaml" \
         --data_path=$data_path \
+        --pretrained_model_path="models/sd_v1.5-d0ab7146.ckpt" \
         --output_path=$output_path/$task_name \
         --use_parallel=True \
-        > $output_path/$task_name/rank_$i/train.log 2>&1 &
-done
+        --dataset_sink_mode=True
diff --git a/examples/stable_diffusion_v2/scripts/run_train_lora.sh b/examples/stable_diffusion_v2/scripts/run_train_lora.sh
index e14ecfd3dc..9c78249e57 100644
--- a/examples/stable_diffusion_v2/scripts/run_train_lora.sh
+++ b/examples/stable_diffusion_v2/scripts/run_train_lora.sh
@@ -1,9 +1,5 @@
 export DEVICE_ID=$1
 
-# for non-INFNAN, keep drop overflow update False
-# export MS_ASCEND_CHECK_OVERFLOW_MODE=1
-export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" # debuggin
-
 task_name=train_lora_sdv1 #rewrite
 output_path=outputs
diff --git a/examples/stable_diffusion_v2/scripts/run_train_v2.sh b/examples/stable_diffusion_v2/scripts/run_train_v2.sh
deleted file mode 100755
index e69de29bb2..0000000000
diff --git a/examples/stable_diffusion_v2/text_to_image.py b/examples/stable_diffusion_v2/text_to_image.py
index e9ca72a43d..99a5c43f05 100644
--- a/examples/stable_diffusion_v2/text_to_image.py
+++ b/examples/stable_diffusion_v2/text_to_image.py
@@ -139,6 +139,7 @@ def main(args):
         args.ms_mode,
         seed=args.seed,
         distributed=args.use_parallel,
+        jit_level=args.jit_level,
         device_target=args.device_target,
     )
 
@@ -349,6 +350,16 @@ def main(args):
     parser.add_argument(
         "--ms_mode", type=int, default=0, help="Running in GRAPH_MODE(0) or PYNATIVE_MODE(1) (default=0)"
     )
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--use_parallel", default=False, type=str2bool, help="use parallel")
     parser.add_argument("--device_target", type=str, nargs="?", default="Ascend", help="Ascend, GPU")
     parser.add_argument(
diff --git a/examples/stable_diffusion_v2/train_cldm.py b/examples/stable_diffusion_v2/train_cldm.py
index eedd6ff11b..b8ff0c5441 100644
--- a/examples/stable_diffusion_v2/train_cldm.py
+++ b/examples/stable_diffusion_v2/train_cldm.py
@@ -98,6 +98,16 @@ def parse_args():
         help="train config path to load a yaml file that override the default arguments",
     )
     parser.add_argument("--mode", default=0, type=int, help="Specify the mode: 0 for graph mode, 1 for pynative mode")
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--use_parallel", default=False, type=str2bool, help="use parallel")
     parser.add_argument("--use_recompute", default=None, type=str2bool, help="whether use recompute")
     parser.add_argument(
@@ -223,6 +233,7 @@ def main(args):
         args.mode,
         seed=args.seed,
         distributed=args.use_parallel,
+        jit_level=args.jit_level,
         enable_modelarts=args.enable_modelarts,
         num_workers=args.num_workers,
         json_data_path=args.json_data_path,
diff --git a/examples/stable_diffusion_v2/train_dreambooth.py b/examples/stable_diffusion_v2/train_dreambooth.py
index ad90d364d7..b068fe4d6e 100644
--- a/examples/stable_diffusion_v2/train_dreambooth.py
+++ b/examples/stable_diffusion_v2/train_dreambooth.py
@@ -35,6 +35,20 @@ def init_env(args):
     set_random_seed(args.seed)
     ms.set_context(max_device_memory=args.max_device_memory)  # TODO: why limit?
     ms.set_context(mode=args.mode)  # needed for MS2.0
+    if args.mode == ms.GRAPH_MODE:
+        try:
+            if args.jit_level in ["O0", "O1", "O2"]:
+                ms.set_context(jit_config={"jit_level": args.jit_level})
+                logger.info(f"set jit_level: {args.jit_level}.")
+            else:
+                logger.warning(
+                    f"Unsupported jit_level: {args.jit_level}. The framework will select the execution method automatically."
+                )
+        except Exception:
+            logger.warning(
+                "The jit_level setting is ignored because the current MindSpore version does not support it. "
+                "Please ensure MindSpore >= 2.3.0."
+            )
     if args.use_parallel:
         init()
         device_id = int(os.getenv("DEVICE_ID"))
@@ -86,6 +100,16 @@ def parse_args():
     parser.add_argument("--unet_initialize_random", default=False, type=str2bool, help="initialize unet randomly")
     parser.add_argument("--dataset_sink_mode", default=False, type=str2bool, help="sink mode")
     parser.add_argument("--mode", default=0, type=int, help="Specify the mode: 0 for graph mode, 1 for pynative mode")
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--use_parallel", default=False, type=str2bool, help="Enable parallel processing")
     parser.add_argument("--max_device_memory", type=str, default="30GB", help="e.g. `30GB` for 910a, `59GB` for 910b")
     parser.add_argument("--use_lora", default=False, type=str2bool, help="Enable LoRA finetuning")
diff --git a/examples/stable_diffusion_v2/train_text_to_image.py b/examples/stable_diffusion_v2/train_text_to_image.py
index d6003025aa..e7749c79f6 100644
--- a/examples/stable_diffusion_v2/train_text_to_image.py
+++ b/examples/stable_diffusion_v2/train_text_to_image.py
@@ -110,6 +110,16 @@ def parse_args():
         help="train config path to load a yaml file that override the default arguments",
     )
     parser.add_argument("--mode", default=0, type=int, help="Specify the mode: 0 for graph mode, 1 for pynative mode")
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--use_parallel", default=False, type=str2bool, help="use parallel")
     parser.add_argument(
         "--replace_small_images",
@@ -242,6 +252,7 @@ def main(args):
         args.mode,
         seed=args.seed,
         distributed=args.use_parallel,
+        jit_level=args.jit_level,
         enable_modelarts=args.enable_modelarts,
         num_workers=args.num_workers,
         json_data_path=args.json_data_path,
diff --git a/examples/stable_diffusion_v2/train_textual_inversion.py b/examples/stable_diffusion_v2/train_textual_inversion.py
index 471573c0df..09861d05da 100644
--- a/examples/stable_diffusion_v2/train_textual_inversion.py
+++ b/examples/stable_diffusion_v2/train_textual_inversion.py
@@ -55,6 +55,16 @@ def parse_args():
     parser = argparse.ArgumentParser(description="A training script for dreambooth.")
     parser.add_argument("--mode", default=0, type=int, help="Specify the mode: 0 for graph mode, 1 for pynative mode")
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument(
         "-v",
         "--version",
@@ -278,6 +288,7 @@ def main(args):
         args.mode,
         seed=args.seed,
         distributed=args.use_parallel,
+        jit_level=args.jit_level,
         enable_modelarts=args.enable_modelarts,
         num_workers=args.num_workers,
         json_data_path=args.json_data_path,
diff --git a/examples/stable_diffusion_v2/train_unclip_image_variation.py b/examples/stable_diffusion_v2/train_unclip_image_variation.py
index c773ad9bfb..4227310de7 100644
--- a/examples/stable_diffusion_v2/train_unclip_image_variation.py
+++ b/examples/stable_diffusion_v2/train_unclip_image_variation.py
@@ -80,6 +80,16 @@ def parse_args():
         help="train config path to load a yaml file that override the default arguments",
     )
     parser.add_argument("--mode", default=0, type=int, help="Specify the mode: 0 for graph mode, 1 for pynative mode")
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument("--use_parallel", default=False, type=str2bool, help="use parallel")
     parser.add_argument(
         "--replace_small_images",
@@ -194,6 +204,7 @@ def main(args):
         args.mode,
         seed=args.seed,
         distributed=args.use_parallel,
+        jit_level=args.jit_level,
         enable_modelarts=args.enable_modelarts,
         num_workers=args.num_workers,
         json_data_path=args.json_data_path,
diff --git a/examples/stable_diffusion_v2/unclip_image_variation.py b/examples/stable_diffusion_v2/unclip_image_variation.py
index d9c2fc0c95..da27e2c04c 100644
--- a/examples/stable_diffusion_v2/unclip_image_variation.py
+++ b/examples/stable_diffusion_v2/unclip_image_variation.py
@@ -17,11 +17,11 @@
 workspace = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(workspace)
+from common import init_env
 from ldm.models.diffusion.ddim import DDIMSampler
 from ldm.models.diffusion.dpm_solver import DPMSolverSampler
 from ldm.modules.logger import set_logger
 from ldm.modules.lora import inject_trainable_lora
-from ldm.modules.train.tools import set_random_seed
 from ldm.util import instantiate_from_config, str2bool
 from utils import model_utils
 from utils.download import download_checkpoint
@@ -130,6 +130,14 @@ def load_image(image: Union[str, Image.Image]) -> ms.Tensor:
 
 
 def main(args):
+    # init
+    device_id, _, _ = init_env(
+        args.ms_mode,
+        seed=args.seed,
+        jit_level=args.jit_level,
+        max_device_memory="30GB",
+    )
+
     # set logger
     set_logger(
         name="",
@@ -180,12 +188,6 @@ def main(args):
     os.makedirs(sample_path, exist_ok=True)
     base_count = len(os.listdir(sample_path))
 
-    # set ms context
-    device_id = int(os.getenv("DEVICE_ID", 0))
-    ms.context.set_context(mode=args.ms_mode, device_target="Ascend", device_id=device_id, max_device_memory="30GB")
-
-    set_random_seed(args.seed)
-
     # create model
     if not os.path.isabs(args.config):
         args.config = os.path.join(work_dir, args.config)
@@ -320,6 +322,16 @@ def main(args):
     parser.add_argument(
         "--ms_mode", type=int, default=0, help="Running in GRAPH_MODE(0) or PYNATIVE_MODE(1) (default=0)"
     )
+    parser.add_argument(
+        "--jit_level",
+        default="O2",
+        type=str,
+        choices=["O0", "O1", "O2"],
+        help="Used to control the compilation optimization level. Supports ['O0', 'O1', 'O2']. "
+        "O0: Except for optimizations that may affect functionality, all other optimizations are turned off, adopt KernelByKernel execution mode. "
+        "O1: Using commonly used optimizations and automatic operator fusion optimizations, adopt KernelByKernel execution mode. "
+        "O2: Ultimate performance optimization, adopt Sink execution mode.",
+    )
     parser.add_argument(
         "--data_path",
         type=str,