# google-research
# 72 lines · 2.0 KB
# coding=utf-8
# Copyright 2024 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Collects basic stats for training and test splits from the results file.

Example:
--------
  LANGUAGE=...
  cat data/ngrams/results/reading/00/baselines/${LANGUAGE}.*.tsv > /tmp/${LANGUAGE}.tsv
  python describe_splits.py \
    --results_tsv_file /tmp/${LANGUAGE}.tsv

Dependencies:
-------------
  absl
  pandas
  statsmodels
"""

import logging
from typing import Sequence

from absl import app
from absl import flags
import pandas as pd
import statsmodels.stats.api as sms

# Path to the concatenated results file. The empty default is rejected in
# `main`, so the flag is effectively mandatory.
flags.DEFINE_string(
    "results_tsv_file", "",
    "Results text file in tab-separated (tsv) format.")

FLAGS = flags.FLAGS

def _to_str(stats):
    """Formats the mean, variance and standard deviation of `stats` as text.

    Args:
      stats: Object exposing `mean`, `var` and `std` attributes; `main` passes
        a `statsmodels` `DescrStatsW` instance.

    Returns:
      A single-line string of the form "mean: <mean> var: <var> std: <std>".
    """
    return f"mean: {stats.mean} var: {stats.var} std: {stats.std}"

def main(argv: Sequence[str]) -> None:
    """Logs descriptive statistics for the train and test split sizes.

    Reads the header-less TSV file named by `--results_tsv_file`, takes
    column 0 as the per-sample train token counts and column 1 as the
    per-sample test token counts, and logs the mean, variance and standard
    deviation of each column.

    Args:
      argv: Positional command-line arguments left over after flag parsing;
        only the program name is accepted.

    Raises:
      app.UsageError: If extra positional arguments are supplied or
        `--results_tsv_file` is empty.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    if not FLAGS.results_tsv_file:
        raise app.UsageError("Specify --results_tsv_file [FILE]!")

    logging.info("Reading metrics from %s ...", FLAGS.results_tsv_file)
    # header=None: the results file has no header row, so columns are
    # addressed by integer position.
    df = pd.read_csv(FLAGS.results_tsv_file, sep="\t", header=None)
    logging.info("Read %s samples", df.shape[0])

    num_train_toks = list(df[0])  # Token can be char or word.
    train_stats = sms.DescrStatsW(num_train_toks)
    logging.info("Train stats: %s", _to_str(train_stats))

    num_test_toks = list(df[1])
    test_stats = sms.DescrStatsW(num_test_toks)
    logging.info("Test stats: %s", _to_str(test_stats))

# Script entry point: absl parses the command-line flags, then calls `main`.
if __name__ == "__main__":
    app.run(main)