-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
33 lines (28 loc) · 1.42 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# based on https://github.com/AkariAsai/ATTEMPT
import regex as re
import numpy as np
def pad_punctuation(text):
    """Surround punctuation with spaces and squeeze whitespace runs.

    Re-implementation of ``_pad_punctuation`` from t5. The input is first
    converted with ``str()``. Every character that is not an underscore,
    whitespace, a number (``\\p{N}``), a letter (``\\p{L}``) or a combining
    mark (``\\p{M}``) gets one space inserted on each side; afterwards each
    run of whitespace is replaced by a single space. The result is not
    stripped, so a leading or trailing whitespace run survives as one space.

    The ``\\p{...}`` classes require the ``regex`` package, which this module
    imports under the name ``re``.

    NOTE(review): the docstring inherited from t5 warned that accented
    characters are padded too ("François" -> "Fran ç ois"). Because letters
    and marks are excluded from the padded class here, that side effect
    presumably does not occur in this version -- confirm before relying on it.

    Args:
      text: value to process; anything accepted by ``str()``.
    Returns:
      The padded, whitespace-collapsed string.
    """
    # Anything outside the "word-like" classes counts as punctuation.
    punctuation = r"([^_\s\p{N}\p{L}\p{M}])"
    whitespace_run = r"\s+"

    spaced = re.sub(punctuation, r" \1 ", str(text))
    # Padding can create doubled spaces; fold every run into one space.
    return re.sub(whitespace_run, " ", spaced)
def round_stsb_target(label):
    """Snap an STSB similarity score to the nearest multiple of 0.2.

    STSB maps two sentences to a floating point number between 0 and 5
    representing their semantic similarity. Since all tasks are treated as
    text-to-text tasks, the score is eventually rendered as a string
    (literally e.g. "3.4"). The vast majority of the similarity labels in
    STSB are in the set [0, 0.2, 0.4, ..., 4.8, 5.0], so the score is rounded
    to the closest entry in this set, which turns STSB roughly into a
    26-class classification dataset.

    This function only performs the rounding and returns a number; turning
    the result into a string is left to the caller.

    Exact ties in the scaled space follow ``np.round`` (round half to even).

    Args:
      label: original similarity score (a scalar or a NumPy array).
    Returns:
      The score rounded to the nearest 0.2, as produced by ``np.round``.
    """
    # Round while scaled by 5 so the grid step is 0.2. Rounding
    # (label * 5) / 5 directly would only round to one decimal place
    # (e.g. 3.33 -> 3.3 instead of 3.4).
    fifths = np.round(label * 5)
    # The outer round removes float noise from the division (e.g. 0.6000001).
    return np.round(fifths / 5, decimals=1)