Skip to content

Commit

Permalink
Merge pull request #639 from FunAudioLLM/dev/lyuxiang.lx
Browse files (browse the repository at this point in the history)
use stream read to save memory
  • Loading branch information
aluminumbox authored Nov 11, 2024
2 parents 6d22d0b + 5ed5bb1 commit 7701325
Showing 1 changed file with 12 additions and 11 deletions.
23 changes: 12 additions & 11 deletions cosyvoice/dataset/processor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,17 +40,18 @@ def parquet_opener(data, mode='train', tts_data={}):
assert 'src' in sample
url = sample['src']
try:
df = pq.read_table(url).to_pandas()
for i in range(len(df)):
if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
continue
sample.update(dict(df.loc[i]))
if mode == 'train':
# NOTE do not return sample directly, must initialize a new dict
yield {**sample}
else:
for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
yield {**sample, 'tts_index': index, 'tts_text': text}
for df in pq.ParquetFile(url).iter_batches(batch_size=64):
df = df.to_pandas()
for i in range(len(df)):
if mode == 'inference' and df.loc[i, 'utt'] not in tts_data:
continue
sample.update(dict(df.loc[i]))
if mode == 'train':
# NOTE do not return sample directly, must initialize a new dict
yield {**sample}
else:
for index, text in enumerate(tts_data[df.loc[i, 'utt']]):
yield {**sample, 'tts_index': index, 'tts_text': text}
except Exception as ex:
logging.warning('Failed to open {}, ex info {}'.format(url, ex))

Expand Down

0 comments on commit 7701325

Please sign in to comment.