Skip to content

Commit

Permalink
Minor refactoring
Browse files: browse the repository at this point in the history
  • Loading branch information
clebert committed Oct 22, 2023
1 parent 286c9dd commit 5866769
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions convert_hf_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,19 +149,19 @@ def write_checkpoint_file():


def write_tokenizer_file():
sp_model = SentencePieceProcessor(
model = SentencePieceProcessor(
model_file=os.path.join(args.input_model_path, "tokenizer.model")
)

words, scores = [], []

for token in range(sp_model.vocab_size()):
word = sp_model.id_to_piece(token)
score = sp_model.get_score(token)
for token in range(model.vocab_size()):
word = model.id_to_piece(token)
score = model.get_score(token)

if token == sp_model.bos_id():
if token == model.bos_id():
word = "\n<s>\n"
elif token == sp_model.eos_id():
elif token == model.eos_id():
word = "\n</s>\n"

words.append(word.replace("▁", " ").encode("utf-8"))
Expand Down

0 comments on commit 5866769

Please sign in to comment.