OpenDelta

utils.py
30 строк · 1.3 Кб
Перенос по словам
1
import numpy as np
2
import re
3

4
def round_stsb_target(label):
    """Round an STSB similarity score to the nearest multiple of 0.2.

    STSB maps two sentences to a floating point number representing their
    semantic similarity. Since all tasks are treated as text-to-text tasks,
    this number eventually has to become a string. The vast majority of the
    similarity labels in STSB are in the set [0, 0.2, 0.4, ..., 4.8, 5.0], so
    the label is first snapped to the closest entry in this set; rendering the
    result as a string (e.g. "3.4") then turns STSB roughly into a 26-class
    classification dataset.

    Note that this function returns a number, not a string: the caller is
    responsible for the final string conversion.

    Args:
      label: original label (a scalar or a numpy array of scores).

    Returns:
      The label rounded to the nearest multiple of 0.2, as a numpy float
      (or array) with one decimal of precision.
    """
    # Scale so that multiples of 0.2 become integers, round to the nearest
    # integer, then scale back. Rounding *before* dividing is what snaps the
    # value onto the 0.2 grid; (label * 5) / 5 alone would be a no-op.
    snapped = np.round(label * 5) / 5
    # The division can leave floating point noise (e.g. 3.4000000000000004);
    # a final one-decimal rounding gives a clean value for string conversion.
    return np.round(snapped, decimals=1)
18

19

20
def pad_punctuation(text):
    """Add spaces around punctuation and collapse runs of whitespace.

    Re-implementation of _pad_punctuation in t5. Every character is padded
    with a space on each side, except for underscores (_), whitespace,
    numbers (Unicode category N*), letters (L*) and accent/combining marks
    (M*). Consecutive whitespace is then collapsed into a single space.

    The standard library ``re`` module does not understand ``\\p{...}``
    Unicode property classes (it raises ``re.error``), so the character
    classes are resolved with ``unicodedata.category`` instead. Accented
    letters such as the "ç" in "François" are letters and are left untouched,
    in both precomposed and decomposed (letter + combining mark) form.

    Args:
      text: the string to process.

    Returns:
      The padded string. No stripping is performed, so padding added around
      a leading or trailing punctuation character remains as a single space.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    import unicodedata

    def _pad_unless_mark(match):
        char = match.group(0)
        # The pattern below already excludes letters, numbers, underscores
        # and whitespace; the category check additionally keeps combining
        # marks (M*) and guards the letter/number classes explicitly.
        if unicodedata.category(char)[0] in "LNM":
            return char
        return " " + char + " "

    # [^\w\s] is a cheap pre-filter: for str patterns \w covers Unicode
    # letters, numbers and the underscore, so only the remaining candidates
    # reach the per-character category lookup.
    text = re.sub(r"[^\w\s]", _pad_unless_mark, text)
    # Collapse consecutive whitespace into one space.
    text = re.sub(r"\s+", " ", text)
    return text
OpenDelta

Использование cookies