Skip to content

Commit

Permalink
use concat instead of interleave
Browse files Browse the repository at this point in the history
  • Loading branch information
wq2012 committed Jun 30, 2024
1 parent f975891 commit 884ac1e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 12 deletions.
2 changes: 1 addition & 1 deletion DiarizationLM/unsloth/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# DataPrep
# Maps a dataset name to a (path to training JSON file, weight) tuple.
# NOTE(review): this is a diff view with its +/- markers lost, so the two
# "FISHER" lines below are the before/after of a one-line change rather than
# two entries. The commit changed one line in this file (1.0 -> 1).
# NOTE(review): "/YOUT_DATA_PATH" looks like a placeholder (probably meant
# "YOUR") to be replaced with the local data location - confirm.
TRAINING_INPUT = {
# Before: the weight was a float, consumed as a sampling probability.
"FISHER": ("/YOUT_DATA_PATH/FISHER_ENGLISH_TRAIN_FULL.json", 1.0),
# After: the weight is an int repeat count; dataprep multiplies a list by it,
# so a float here would raise TypeError.
"FISHER": ("/YOUT_DATA_PATH/FISHER_ENGLISH_TRAIN_FULL.json", 1),
}
# Presumably length limits applied when emitting prompts/targets; the units
# and the consumer are not visible in this hunk - TODO confirm.
EMIT_INPUT_LENGTH = 6000
EMIT_TARGET_LENGTH = 6000
Expand Down
16 changes: 5 additions & 11 deletions DiarizationLM/unsloth/dataprep.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import config
from datasets import Dataset, disable_caching, interleave_datasets
from datasets import Dataset, disable_caching, concatenate_datasets
from diarizationlm import utils


Expand Down Expand Up @@ -33,22 +33,16 @@ def build_dataset_single_source(input_file: str):
)
dataset1 = Dataset.from_generator(reader_hyp2ora.generate_data_dict)
dataset2 = Dataset.from_generator(reader_deg2ref.generate_data_dict)
return [dataset1, dataset2]
return concatenate_datasets([dataset1, dataset2])


def build_dataset():
# Builds one shuffled, prompt-formatted dataset from every source listed in
# config.TRAINING_INPUT.
# NOTE(review): this is a diff view whose +/- markers and indentation were
# lost; the removed (interleave) and added (concat) lines appear together
# below and are not meant to run as a single body.
disable_caching()
all_datasets = []
# Removed by this commit: per-dataset sampling probabilities.
all_probs = []
for data_name in config.TRAINING_INPUT:
# Removed: the second tuple element was read as a sampling probability.
data_path, data_prob = config.TRAINING_INPUT[data_name]
# Removed: build_dataset_single_source used to return a list of two datasets,
# hence extend() here and the "* 2" on the probabilities below.
all_datasets.extend(build_dataset_single_source(data_path))
all_probs.extend([data_prob] * 2)
# Removed: normalise the probabilities so they sum to 1 before interleaving.
prob_sum = sum(all_probs)
all_probs = [prob / prob_sum for prob in all_probs]
# Removed: probabilistic interleave of all datasets.
dataset = interleave_datasets(all_datasets,
probabilities=all_probs,
stopping_strategy="all_exhausted")
# Added: the second tuple element is now an integer repeat count.
data_path, data_repeat = config.TRAINING_INPUT[data_name]
# Added: the helper now returns a single concatenated dataset; repeating the
# one-element list adds that dataset data_repeat times (requires an int).
all_datasets.extend([build_dataset_single_source(data_path)] * data_repeat)
# Added: plain concatenation replaces the probabilistic interleave.
dataset = concatenate_datasets(all_datasets)
# Fixed seed, so the shuffle uses the same seed on every run.
dataset = dataset.shuffle(seed=42)
# formatting_prompts_func is defined outside this hunk; presumably it renders
# each example into the training prompt format - TODO confirm.
dataset = dataset.map(formatting_prompts_func)
return dataset

0 comments on commit 884ac1e

Please sign in to comment.