diff --git a/config/high_order_interpolation.yaml b/config/high_order_interpolation.yaml
index 727cab2..8d5d2de 100644
--- a/config/high_order_interpolation.yaml
+++ b/config/high_order_interpolation.yaml
@@ -48,6 +48,7 @@
 max_epochs: 100
 accelerator: cuda
 batch_size: 256
 gradient_clip: null # 5.0
+accumulate_grad_batches: 1
 # Are you training? Otherwise plot the result
 train: True
diff --git a/config/net/transformer.yaml b/config/net/transformer.yaml
index 65fbaed..42efba6 100644
--- a/config/net/transformer.yaml
+++ b/config/net/transformer.yaml
@@ -9,30 +9,41 @@
 normalize: true
 # number of Fourier components.
 model_type: high_order_transformer
 n: 3
-input_segments: 64
+
 segments: 2
-# Layers in the form [input, output, segments]
+base_width: 64
 # attention blocks
-#layers: [[10, 10, 64], [10, 5, 2], [5, 5, 2]]
 inner: 10
-#layers: [[1, ${inner}, 128], [${inner}, ${inner}, 10], [${inner}, ${inner}, 10]]
+
 layers:
-  [
-    {
-      "input": 1,
-      "hidden": 16,
-      "output": 16,
-      "layers": 1,
-      "segments": 16,
-      "input_segments": 128,
-    },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "output": 16, "segments": 16 },
-    { "input": 16, "hidden": 100, "layers": 1, "segments": 10 },
-  ]
+  - input: 1
+    hidden: 16
+    output: 16
+    layers: 1
+    segments: 16
+    input_segments: 128
+
+  - input: 16
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    output: ${net.base_width}
+    segments: ${net.segments}
+
+  - input: ${net.base_width}
+    hidden: 100
+    layers: 1
+    segments: 10
 
 # Note! output dimension is c=heads*output so different than normal
 # And then similarity size is c*c*batch_size
diff --git a/examples/high_order_interpolation.py b/examples/high_order_interpolation.py
index 28bb3e5..e866de3 100644
--- a/examples/high_order_interpolation.py
+++ b/examples/high_order_interpolation.py
@@ -27,7 +27,9 @@
 logging.getLogger().setLevel(logging.DEBUG)
 
 
-@hydra.main(config_path="../config", config_name="high_order_interpolation", version_base="1.3")
+@hydra.main(
+    config_path="../config", config_name="high_order_interpolation", version_base="1.3"
+)
 def run_language_interpolation(cfg: DictConfig):
     logger.info(OmegaConf.to_yaml(cfg))
     logger.info("Working directory : {}".format(os.getcwd()))
@@ -36,7 +38,6 @@ def run_language_interpolation(cfg: DictConfig):
     create_gutenberg_cache(parent_directory=hydra.utils.get_original_cwd())
 
     if cfg.train is True:
-
         try:  # Try is needed for multirun case
             if cfg.data.type in dataset_registry:
                 dataset_generator = dataset_registry[cfg.data.type]
@@ -46,7 +47,6 @@ def run_language_interpolation(cfg: DictConfig):
                 )
 
             if cfg.net.model_type == "high_order_transformer":
-
                 # dataset_generator is only one type so using the default
                 datamodule = TransformerDataModule(
                     characters_per_feature=cfg.data.characters_per_feature,
@@ -101,6 +101,7 @@ def run_language_interpolation(cfg: DictConfig):
                max_epochs=cfg.max_epochs,
                accelerator=cfg.accelerator,
                gradient_clip_val=cfg.gradient_clip,
+               accumulate_grad_batches=cfg.accumulate_grad_batches,
            )
 
            model = ASCIIPredictionNet(cfg)
diff --git a/language_interpolation/networks.py b/language_interpolation/networks.py
index c6e01a3..5b6cc43 100644
--- a/language_interpolation/networks.py
+++ b/language_interpolation/networks.py
@@ -561,11 +561,14 @@ def select_network(cfg: DictConfig, device: str = None):
 
     if cfg.initialize.type == "linear":
         logger.info("Performing linear initialization")
+        start_init = time.perf_counter()
         initialize_network_polynomial_layers(
             model,
             max_slope=cfg.initialize.max_slope,
             max_offset=cfg.initialize.max_offset,
         )
+        finish_time = time.perf_counter() - start_init
+        logger.info(f"Finished linear initialization {finish_time}")
 
     return model
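The new accumulate_grad_batches key defaults to 1, so training behavior is unchanged until it is overridden; values above 1 make Lightning accumulate gradients over that many batches before each optimizer step, giving a larger effective batch size without increasing per-batch GPU memory. Below is a minimal sketch of that relationship, not code from this patch: the pytorch_lightning import path and the override value of 4 are assumptions for illustration.

# Sketch only: how accumulate_grad_batches relates to the effective batch size
# once it is forwarded to the Lightning Trainer, as the patch above does.
# Assumes the pytorch_lightning import path; 4 is a hypothetical override
# (the default added in this patch is accumulate_grad_batches: 1).
from pytorch_lightning import Trainer

batch_size = 256             # batch_size in config/high_order_interpolation.yaml
accumulate_grad_batches = 4  # hypothetical Hydra override

trainer = Trainer(
    max_epochs=100,
    accelerator="cuda",
    gradient_clip_val=None,
    accumulate_grad_batches=accumulate_grad_batches,
)

# Lightning sums gradients over accumulate_grad_batches forward/backward passes
# before each optimizer step, so the optimizer effectively sees:
effective_batch_size = batch_size * accumulate_grad_batches  # 256 * 4 = 1024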
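The layers rewrite in config/net/transformer.yaml replaces the flow-style list with a block-style list whose widths and segment counts are OmegaConf interpolations, so every attention block now reads its size from the single base_width and segments keys. The following is a minimal sketch of how those interpolations resolve, not code from this patch: it assumes the file is composed under the net key (as the ${net.*} references imply) and abbreviates the list to two entries.

# Sketch only: ${net.base_width} / ${net.segments} resolve against the composed
# config, so one override rewires every block that references them.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "net": {
            "base_width": 64,
            "segments": 2,
            "layers": [
                {
                    "input": 16,
                    "output": "${net.base_width}",
                    "segments": "${net.segments}",
                },
                {
                    "input": "${net.base_width}",
                    "output": "${net.base_width}",
                    "segments": "${net.segments}",
                },
            ],
        }
    }
)

print(cfg.net.layers[0].output)    # 64 -- pulled from net.base_width
print(cfg.net.layers[1].segments)  # 2  -- pulled from net.segments

# Interpolations resolve lazily, so changing the shared value once is enough:
cfg.net.base_width = 128
print(cfg.net.layers[1].output)    # 128

Because resolution happens at access time, a command-line override or multirun sweep such as net.base_width=32,64,128 (assuming this file is selected as the net default) changes the width of every attention block consistently.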