# Copyright 2022 The HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XTREME-S benchmark metric."""
from typing import List

from packaging import version
from sklearn.metrics import f1_score

import datasets
from datasets.config import PY_VERSION


# `importlib.metadata` is stdlib only from Python 3.8 on; fall back to the
# `importlib_metadata` backport package on older interpreters.
if PY_VERSION < version.parse("3.8"):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata
XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
37
XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
40
_KWARGS_DESCRIPTION = """
41
Compute XTREME-S evaluation metric associated to each XTREME-S dataset.
43
predictions: list of predictions to score.
44
Each translation should be tokenized into a list of tokens.
45
references: list of lists of references for each translation.
46
Each reference should be tokenized into a list of tokens.
47
bleu_kwargs: optional dict of keywords to be passed when computing 'bleu'.
48
Keywords include Dict can be one of 'smooth_method', 'smooth_value', 'force', 'lowercase',
49
'tokenize', 'use_effective_order'.
50
wer_kwargs: optional dict of keywords to be passed when computing 'wer' and 'cer'.
51
Keywords include 'concatenate_texts'.
52
Returns: depending on the XTREME-S task, one or several of:
53
"accuracy": Accuracy - for 'fleurs-lang_id', 'minds14'
54
"f1": F1 score - for 'minds14'
55
"wer": Word error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
56
"cer": Character error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
57
"bleu": BLEU score according to the `sacrebleu` metric - for 'covost2'
60
>>> xtreme_s_metric = datasets.load_metric('xtreme_s', 'mls') # 'mls', 'voxpopuli', 'fleurs-asr' or 'babel'
61
>>> references = ["it is sunny here", "paper and pen are essentials"]
62
>>> predictions = ["it's sunny", "paper pen are essential"]
63
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
64
>>> print({k: round(v, 2) for k, v in results.items()})
65
{'wer': 0.56, 'cer': 0.27}
67
>>> xtreme_s_metric = datasets.load_metric('xtreme_s', 'covost2')
68
>>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
69
>>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
70
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
71
>>> print({k: round(v, 2) for k, v in results.items()})
74
>>> xtreme_s_metric = datasets.load_metric('xtreme_s', 'fleurs-lang_id')
75
>>> references = [0, 1, 0, 0, 1]
76
>>> predictions = [0, 1, 1, 0, 0]
77
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
78
>>> print({k: round(v, 2) for k, v in results.items()})
81
>>> xtreme_s_metric = datasets.load_metric('xtreme_s', 'minds14')
82
>>> references = [0, 1, 0, 0, 1]
83
>>> predictions = [0, 1, 1, 0, 0]
84
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
85
>>> print({k: round(v, 2) for k, v in results.items()})
86
{'f1': 0.58, 'accuracy': 0.6}
89
_CONFIG_NAMES = ["fleurs-asr", "mls", "voxpopuli", "babel", "covost2", "fleurs-lang_id", "minds14"]
90
SENTENCE_DELIMITER = ""
93
from jiwer import transforms as tr
95
_jiwer_available = True
97
_jiwer_available = False
99
if _jiwer_available and version.parse(importlib_metadata.version("jiwer")) < version.parse("2.3.0"):
    # jiwer < 2.3.0 has no built-in character-level reduction, so emulate it
    # with a custom transform.

    class SentencesToListOfCharacters(tr.AbstractTransform):
        """Flatten a list of sentences into a single list of characters."""

        def __init__(self, sentence_delimiter: str = " "):
            self.sentence_delimiter = sentence_delimiter

        def process_string(self, s: str):
            # A string becomes its list of characters.
            return list(s)

        def process_list(self, inp: List[str]):
            chars = []
            for sent_idx, sentence in enumerate(inp):
                chars.extend(self.process_string(sentence))
                # Insert the delimiter between sentences, never after the last.
                if self.sentence_delimiter is not None and self.sentence_delimiter != "" and sent_idx < len(inp) - 1:
                    chars.append(self.sentence_delimiter)
            return chars

    cer_transform = tr.Compose(
        [tr.RemoveMultipleSpaces(), tr.Strip(), SentencesToListOfCharacters(SENTENCE_DELIMITER)]
    )
elif _jiwer_available:
    # jiwer >= 2.3.0 ships the character-level transforms natively.
    cer_transform = tr.Compose(
        [
            tr.RemoveMultipleSpaces(),
            tr.Strip(),
            tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
            tr.ReduceToListOfListOfChars(),
        ]
    )
else:
    # Without jiwer the ASR configs fail with a helpful error in `wer_and_cer`.
    cer_transform = None
def simple_accuracy(preds, labels):
    """Return the fraction of predictions equal to their labels.

    Expects array-like inputs that support elementwise ``==`` and ``.mean()``
    (e.g. numpy arrays, which the metric's numpy format provides).
    """
    return float((preds == labels).mean())
def f1_and_simple_accuracy(preds, labels):
    """Return macro-averaged F1 together with simple accuracy (for 'minds14')."""
    return {
        "f1": float(f1_score(y_true=labels, y_pred=preds, average="macro")),
        "accuracy": simple_accuracy(preds, labels),
    }
def bleu(
    preds,
    labels,
    smooth_method="exp",
    smooth_value=None,
    force=False,
    lowercase=False,
    tokenize=None,
    use_effective_order=False,
):
    """Corpus-level BLEU via `sacrebleu`, used for the 'covost2' config.

    Args mirror `sacrebleu.corpus_bleu`; `tokenize` is only forwarded when set
    so sacrebleu's own default tokenizer applies otherwise.

    Raises:
        ValueError: if sacrebleu is not installed, or the number of references
            per prediction is inconsistent.
        ImportWarning: if the installed sacrebleu is older than 1.4.12.
    """
    # xtreme-s can only have one label
    labels = [[label] for label in labels]
    preds = list(preds)
    try:
        import sacrebleu as scb
    except ImportError:
        raise ValueError(
            "sacrebleu has to be installed in order to apply the bleu metric for covost2."
            "You can install it via `pip install sacrebleu`."
        )

    if version.parse(scb.__version__) < version.parse("1.4.12"):
        raise ImportWarning(
            "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
            'You can install it with `pip install "sacrebleu>=1.4.12"`.'
        )

    # sacrebleu expects references transposed: one stream per reference index.
    references_per_prediction = len(labels[0])
    if any(len(refs) != references_per_prediction for refs in labels):
        raise ValueError("Sacrebleu requires the same number of references for each prediction")
    transformed_references = [[refs[i] for refs in labels] for i in range(references_per_prediction)]
    output = scb.corpus_bleu(
        preds,
        transformed_references,
        smooth_method=smooth_method,
        smooth_value=smooth_value,
        force=force,
        lowercase=lowercase,
        use_effective_order=use_effective_order,
        **({"tokenize": tokenize} if tokenize else {}),
    )
    return {"bleu": output.score}
def wer_and_cer(preds, labels, concatenate_texts, config_name):
    """Word and character error rates via `jiwer`, for the ASR configs.

    When `concatenate_texts` is true, all utterances are scored as a single
    text; otherwise edit operations are accumulated per utterance pair and
    normalized once at the end.

    Raises:
        ValueError: if jiwer is not installed.
    """
    try:
        from jiwer import compute_measures
    except ImportError:
        raise ValueError(
            f"jiwer has to be installed in order to apply the wer metric for {config_name}."
            "You can install it via `pip install jiwer`."
        )

    if concatenate_texts:
        wer = compute_measures(labels, preds)["wer"]

        # CER is WER computed over character-level transforms of the texts.
        cer = compute_measures(labels, preds, truth_transform=cer_transform, hypothesis_transform=cer_transform)["wer"]
        return {"wer": wer, "cer": cer}
    else:

        def compute_score(preds, labels, score_type="wer"):
            # Sum edit operations over all pairs, then normalize by the total
            # reference length (substitutions + deletions + hits).
            incorrect = 0
            total = 0
            for prediction, reference in zip(preds, labels):
                if score_type == "wer":
                    measures = compute_measures(reference, prediction)
                elif score_type == "cer":
                    measures = compute_measures(
                        reference, prediction, truth_transform=cer_transform, hypothesis_transform=cer_transform
                    )
                incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
                total += measures["substitutions"] + measures["deletions"] + measures["hits"]

            return incorrect / total

        return {"wer": compute_score(preds, labels, "wer"), "cer": compute_score(preds, labels, "cer")}
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class XtremeS(datasets.Metric):
    """XTREME-S metric; the scorer used depends on `self.config_name`."""

    def _info(self):
        # Validate the config early so a typo fails fast, before any data flows.
        if self.config_name not in _CONFIG_NAMES:
            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")

        # Classification configs score integer labels; all others score strings.
        pred_type = "int64" if self.config_name in ["fleurs-lang_id", "minds14"] else "string"

        return datasets.MetricInfo(
            description=_DESCRIPTION,
            # NOTE(review): the citation string is not visible in this chunk;
            # restore `_CITATION` here if it is defined elsewhere in the file.
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {"predictions": datasets.Value(pred_type), "references": datasets.Value(pred_type)}
            ),
            # numpy format so `simple_accuracy` gets elementwise `==`/`.mean()`.
            format="numpy",
        )

    def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):
        """Dispatch to the scorer matching the selected configuration."""
        bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
        wer_kwargs = wer_kwargs if wer_kwargs is not None else {}

        if self.config_name == "fleurs-lang_id":
            return {"accuracy": simple_accuracy(predictions, references)}
        elif self.config_name == "minds14":
            return f1_and_simple_accuracy(predictions, references)
        elif self.config_name == "covost2":
            # Pop recognized keywords so defaults apply when callers omit them.
            smooth_method = bleu_kwargs.pop("smooth_method", "exp")
            smooth_value = bleu_kwargs.pop("smooth_value", None)
            force = bleu_kwargs.pop("force", False)
            lowercase = bleu_kwargs.pop("lowercase", False)
            tokenize = bleu_kwargs.pop("tokenize", None)
            use_effective_order = bleu_kwargs.pop("use_effective_order", False)
            return bleu(
                preds=predictions,
                labels=references,
                smooth_method=smooth_method,
                smooth_value=smooth_value,
                force=force,
                lowercase=lowercase,
                tokenize=tokenize,
                use_effective_order=use_effective_order,
            )
        elif self.config_name in ["fleurs-asr", "mls", "voxpopuli", "babel"]:
            concatenate_texts = wer_kwargs.pop("concatenate_texts", False)
            return wer_and_cer(predictions, references, concatenate_texts, self.config_name)
        else:
            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")