OpenDelta
30 строк · 1.3 Кб
1import numpy as np
2import re
3
def round_stsb_target(label):
    """Round an STSB similarity score to the nearest multiple of 0.2.

    STSB maps two sentences to a floating point number between 0 and 5
    representing their semantic similarity. The vast majority of the
    similarity labels are in the set [0, 0.2, 0.4, ..., 4.8, 5.0], so the
    score is snapped to the closest entry of that set. This turns STSB
    roughly into a 26-class classification dataset once the result is
    rendered as text (e.g. "3.4").

    Args:
        label: original label; a Python/NumPy scalar or a NumPy array.

    Returns:
        The label rounded to the nearest multiple of 0.2, as a NumPy float
        (or array). The value is numeric; converting it to a string is left
        to the caller.
    """
    # Scale so that multiples of 0.2 land on integers, round there, and
    # scale back. Rounding must happen *between* the multiply and the
    # divide; `(label * 5) / 5` alone is a no-op.
    snapped = np.round(label * 5) / 5
    # The division can leave float noise (e.g. 3.4000000000000004), so
    # tidy up to one decimal place.
    return np.round(snapped, decimals=1)
18
19
def pad_punctuation(text):
    """Surround punctuation and symbols with spaces.

    Re-implementation of _pad_punctuation in t5. Every character is padded
    with a space on each side except for underscores, whitespace, and
    characters whose Unicode general category is a number (N*), letter (L*)
    or mark (M*). Accented letters such as the "ç" in "François" are
    letters (or letter + combining mark) and are therefore left untouched.
    Runs of whitespace are then collapsed into a single space; leading or
    trailing whitespace produced by the padding is not stripped.

    Args:
        text: the string to process.

    Returns:
        The padded string with consecutive whitespace collapsed.
    """
    # The standard-library `re` module has no \p{...} Unicode property
    # classes (a pattern containing them raises re.error), so characters
    # are classified via unicodedata instead.
    import unicodedata

    def _keep_as_is(ch):
        # Underscores and whitespace are never padded.
        if ch == '_' or ch.isspace():
            return True
        # The first letter of the general category is the major class:
        # N = number, L = letter, M = mark (combining accents).
        return unicodedata.category(ch)[0] in 'NLM'

    padded = ''.join(
        ch if _keep_as_is(ch) else ' ' + ch + ' ' for ch in text
    )
    # Collapse consecutive whitespace into one space.
    return re.sub(r'\s+', ' ', padded)