"""Official evaluation script for SQuAD version 2.0.

In addition to basic functionality, we also compute additional statistics and
plot precision-recall curves if an additional na_prob.json file is provided.
This file is expected to map question IDs to the model's predicted probability
that a question is unanswerable.
"""
import argparse
import collections
import json
import os
import re
import string
import sys

import numpy as np


ARTICLES_REGEX = re.compile(r"\b(a|an|the)\b", re.UNICODE)
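# Example invocation (illustrative only: the script filename is an assumption,
# and the file names are just the argparse metavars defined below):
#   python evaluate_v2.py data.json pred.json --na-prob-file na_prob.json \
#       --out-image-dir out_images --out-file eval.json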

OPTS = None


def parse_args():
    parser = argparse.ArgumentParser("Official evaluation script for SQuAD version 2.0.")
    parser.add_argument("data_file", metavar="data.json", help="Input data JSON file.")
    parser.add_argument("pred_file", metavar="pred.json", help="Model predictions.")
    parser.add_argument(
        "--out-file", "-o", metavar="eval.json", help="Write accuracy metrics to file (default is stdout)."
    )
    parser.add_argument(
        "--na-prob-file", "-n", metavar="na_prob.json", help="Model estimates of probability of no answer."
    )
    parser.add_argument(
        "--na-prob-thresh",
        "-t",
        type=float,
        default=1.0,
        help='Predict "" if no-answer probability exceeds this (default = 1.0).',
    )
    parser.add_argument(
        "--out-image-dir", "-p", metavar="out_images", default=None, help="Save precision-recall curves to directory."
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()


def make_qid_to_has_ans(dataset):
    qid_to_has_ans = {}
    for article in dataset:
        for p in article["paragraphs"]:
            for qa in p["qas"]:
                qid_to_has_ans[qa["id"]] = bool(qa["answers"]["text"])
    return qid_to_has_ans


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return ARTICLES_REGEX.sub(" ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
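    # Bag-of-tokens overlap: Counter intersection keeps each shared token
    # min(count in gold, count in pred) times.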
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def get_raw_scores(dataset, preds):
    exact_scores = {}
    f1_scores = {}
    for article in dataset:
        for p in article["paragraphs"]:
            for qa in p["qas"]:
                qid = qa["id"]
                gold_answers = [t for t in qa["answers"]["text"] if normalize_answer(t)]
                if not gold_answers:
                    # For unanswerable questions, only correct answer is empty string
                    gold_answers = [""]
                if qid not in preds:
                    print(f"Missing prediction for {qid}")
                    continue
                a_pred = preds[qid]
                # Take max over all gold answers
                exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
                f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
    return exact_scores, f1_scores


def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    new_scores = {}
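    # Above the threshold the model is taken to predict "no answer": the score
    # becomes 1 if the gold is unanswerable, 0 otherwise; below it, the raw
    # score is kept unchanged.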
    for qid, s in scores.items():
        pred_na = na_probs[qid] > na_prob_thresh
        if pred_na:
            new_scores[qid] = float(not qid_to_has_ans[qid])
        else:
            new_scores[qid] = s
    return new_scores


def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    if not qid_list:
        total = len(exact_scores)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores.values()) / total),
                ("f1", 100.0 * sum(f1_scores.values()) / total),
                ("total", total),
            ]
        )
    else:
        total = len(qid_list)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
                ("total", total),
            ]
        )


def merge_eval(main_eval, new_eval, prefix):
    for k in new_eval:
        main_eval[f"{prefix}_{k}"] = new_eval[k]


def plot_pr_curve(precisions, recalls, out_image, title):
    plt.step(recalls, precisions, color="b", alpha=0.2, where="post")
    plt.fill_between(recalls, precisions, step="post", alpha=0.2, color="b")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.title(title)
    plt.savefig(out_image)
    plt.clf()


def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, out_image=None, title=None):
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    true_pos = 0.0
    cur_p = 1.0
    cur_r = 0.0
    precisions = [1.0]
    recalls = [0.0]
    avg_prec = 0.0
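    # Sweep qids from most to least confident (lowest na_prob first); each prefix
    # of the sweep is one operating point, and average precision accumulates by
    # the rectangle rule over recall increments.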
    for i, qid in enumerate(qid_list):
        if qid_to_has_ans[qid]:
            true_pos += scores[qid]
        cur_p = true_pos / float(i + 1)
        cur_r = true_pos / float(num_true_pos)
        if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i + 1]]:
            # i.e., if we can put a threshold after this point
            avg_prec += cur_p * (cur_r - recalls[-1])
            precisions.append(cur_p)
            recalls.append(cur_r)
    if out_image:
        plot_pr_curve(precisions, recalls, out_image, title)
    return {"ap": 100.0 * avg_prec}


def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans, out_image_dir):
    if out_image_dir and not os.path.exists(out_image_dir):
        os.makedirs(out_image_dir)
    num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
    if num_true_pos == 0:
        return
    pr_exact = make_precision_recall_eval(
        exact_raw,
        na_probs,
        num_true_pos,
        qid_to_has_ans,
        out_image=os.path.join(out_image_dir, "pr_exact.png"),
        title="Precision-Recall curve for Exact Match score",
    )
    pr_f1 = make_precision_recall_eval(
        f1_raw,
        na_probs,
        num_true_pos,
        qid_to_has_ans,
        out_image=os.path.join(out_image_dir, "pr_f1.png"),
        title="Precision-Recall curve for F1 score",
    )
    oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
    pr_oracle = make_precision_recall_eval(
        oracle_scores,
        na_probs,
        num_true_pos,
        qid_to_has_ans,
        out_image=os.path.join(out_image_dir, "pr_oracle.png"),
        title="Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)",
    )
    merge_eval(main_eval, pr_exact, "pr_exact")
    merge_eval(main_eval, pr_f1, "pr_f1")
    merge_eval(main_eval, pr_oracle, "pr_oracle")


def histogram_na_prob(na_probs, qid_list, image_dir, name):
    if not qid_list:
        return
    x = [na_probs[k] for k in qid_list]
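    # Uniform weights of 1/N turn raw bin counts into proportions of the dataset.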
    weights = np.ones_like(x) / float(len(x))
    plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
    plt.xlabel("Model probability of no-answer")
    plt.ylabel("Proportion of dataset")
    plt.title(f"Histogram of no-answer probability: {name}")
    plt.savefig(os.path.join(image_dir, f"na_prob_hist_{name}.png"))
    plt.clf()


def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
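    # Start from threshold 0 (answer nothing), where every NoAns question scores 1.
    # Walking up the sorted na_probs admits one more answered prediction at a time:
    # a HasAns question adds its score, a non-empty guess on a NoAns question costs 1.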
    for i, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh


def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval["best_exact"] = best_exact
    main_eval["best_exact_thresh"] = exact_thresh
    main_eval["best_f1"] = best_f1
    main_eval["best_f1_thresh"] = f1_thresh


def main():
    with open(OPTS.data_file) as f:
        dataset_json = json.load(f)
        dataset = dataset_json["data"]
    with open(OPTS.pred_file) as f:
        preds = json.load(f)
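    # Without --na-prob-file, assume no-answer probability 0.0 for every prediction.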
    if OPTS.na_prob_file:
        with open(OPTS.na_prob_file) as f:
            na_probs = json.load(f)
    else:
        na_probs = {k: 0.0 for k in preds}
    qid_to_has_ans = make_qid_to_has_ans(dataset)
    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh)
    f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh)
    out_eval = make_eval_dict(exact_thresh, f1_thresh)
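    # Break the aggregate metrics out over the answerable and unanswerable subsets.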
    if has_ans_qids:
        has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
        merge_eval(out_eval, has_ans_eval, "HasAns")
    if no_ans_qids:
        no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
        merge_eval(out_eval, no_ans_eval, "NoAns")
    if OPTS.na_prob_file:
        find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
    if OPTS.na_prob_file and OPTS.out_image_dir:
        run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans, OPTS.out_image_dir)
        histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, "hasAns")
        histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, "noAns")
    if OPTS.out_file:
        with open(OPTS.out_file, "w") as f:
            json.dump(out_eval, f)
    else:
        print(json.dumps(out_eval, indent=2))


if __name__ == "__main__":
    OPTS = parse_args()
    if OPTS.out_image_dir:
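        # Import matplotlib lazily and select the non-interactive Agg backend so
        # figures can be written to disk on machines without a display.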
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    main()