Skip to content

Commit

Permalink
[perf] reduce memory usage when processing data
Browse files: browse the repository at this point in the history
  • Loading branch information
jonatasgrosman committed Aug 30, 2023
1 parent d447203 commit e32b415
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion huggingsound/speech_recognition/model.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations
import os
import sys
import shutil
import torch
import warnings
import logging
Expand Down Expand Up @@ -267,12 +268,20 @@ def _get_dataset(self, processor: Wav2Vec2Processor, text_normalizer: Callable[[
else:
logger.info("Converting data format...")
dataset = get_dataset_from_dict_list(data)

if data_cache_dir is not None:
logger.info("Caching raw data...")
dataset.save_to_disk(f"{data_cache_dir}_raw")
dataset = load_from_disk(f"{data_cache_dir}_raw")

logger.info("Preparing data input and labels...")
dataset = self._prepare_dataset_for_finetuning(dataset, processor, text_normalizer, length_column_name, num_workers)

if data_cache_dir is not None:
logger.info("Caching data...")
logger.info("Caching processed data...")
dataset.save_to_disk(data_cache_dir)
logger.info("Removing raw data cache...")
shutil.rmtree(f"{data_cache_dir}_raw")

return dataset

Expand Down

0 comments on commit e32b415

Please sign in to comment.