{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%run -n main.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %run -n main.py\n",
    "# for name in [SENT, TOKEN]:\n",
    "#     path = join_path(DATA_DIR, name, DATASET)\n",
    "#     !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for type in [TOKEN, SENT]:\n",
    "#     log(type)\n",
    "#     for name in [CORPORA, SYNTAG, GICRYA, RNC]:\n",
    "#         paths = (\n",
    "#             join_path(CORUS_DATA_DIR, _)\n",
    "#             for _ in CORUS_FILES[name]\n",
    "#         )\n",
    "#         records = (\n",
    "#             record\n",
    "#             for path in paths\n",
    "#             for record in DATASETS[name](path)\n",
    "#         )\n",
    "#         records = log_progress(records, desc=name)\n",
    "#         partitions = PARSES[type][name](records)\n",
    "#         lines = format_partitions(partitions)\n",
    "#         lines = sample(lines)\n",
    "#         path = join_path(DATA_DIR, type, DATASET, name + JL + GZ)\n",
    "#         dump_gz_lines(lines, path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for type in [TOKEN, SENT]:\n",
    "#     for name in MODELS[type]:\n",
    "#         path = join_path(DATA_DIR, type, name)\n",
    "#         !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install rusenttokenize\n",
    "# !pip install segtok\n",
    "# !pip install mosestokenizer\n",
    "# !pip install razdel\n",
    "\n",
    "# !pip install spacy\n",
    "# !pip install https://github.com/aatimofeev/spacy_russian_tokenizer/archive/master.zip#egg=python-simhash\n",
    "# !pip install https://github.com/Koziev/rutokenizer/archive/master.zip#egg=rutokenizer\n",
    "# !pip install pymystem3\n",
    "\n",
    "# !pip install nltk\n",
    "# nltk.download('punkt')\n",
    "# !wget https://raw.githubusercontent.com/mhq/train_punkt/master/russian.pickle -O ~/nltk_data/tokenizers/punkt/PY3/russian.pickle\n",
    "\n",
    "# Texterra\n",
    "# Could also compare against https://texterra.ispras.ru/products, but\n",
    "# 1. it is slow, at the very least due to HTTP overhead\n",
    "# 2. it sometimes raises an error (possibly on English sentences)\n",
    "# 3. its quality is only slightly above segtok\n",
    "\n",
    "# Polyglot\n",
    "# implements http://www.unicode.org/reports/tr29/\n",
    "\n",
    "# Could not get it installed — serious trouble with ICU\n",
    "# brew install icu4c\n",
    "# export ICU_VERSION=62.1\n",
    "# export BASE=/usr/local/Cellar/icu4c/\n",
    "# export PATH=$PATH:$BASE/$ICU_VERSION/bin\n",
    "# export PYICU_INCLUDES=$BASE/$ICU_VERSION/include\n",
    "# export PYICU_LFLAGS=-L$BASE/$ICU_VERSION/lib\n",
    "# pip install pyicu polyglot\n",
    "\n",
    "# It seemed to install, but\n",
    "# > from polyglot.text import Text\n",
    "# > Text('...')\n",
    "# Symbol not found: __ZNK6icu_6214Transliterator12getTargetSetERNS_10UnicodeSetE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# for type in [SENT, TOKEN]:\n",
    "#     log(type)\n",
    "#     for model_name in MODELS[type]:\n",
    "#         log(model_name)\n",
    "#         model = MODELS[type][model_name]\n",
    "#         if is_class(model):\n",
    "#             model = model()\n",
    "#         model = Timing(model)\n",
    "\n",
    "#         for dataset_name in DATASETS:\n",
    "#             path = join_path(DATA_DIR, type, DATASET, dataset_name + JL + GZ)\n",
    "#             lines = load_gz_lines(path)\n",
    "#             records = parse_partitions(lines)\n",
    "#             records = log_progress(records, desc=dataset_name)\n",
    "\n",
    "#             records = (\n",
    "#                 Partition.from_substrings(model(_.text))\n",
    "#                 for _ in records\n",
    "#             )\n",
    "#             path = join_path(DATA_DIR, type, model_name, dataset_name + JL + GZ)\n",
    "#             lines = format_partitions(records)\n",
    "#             dump_gz_lines(lines, path)\n",
    "\n",
    "#             path = join_path(DATA_DIR, STATS + JL)\n",
    "#             record = [[type, model_name, dataset_name], model.time]\n",
    "#             lines = format_jl([record])\n",
    "#             append_lines(lines, path)\n",
    "\n",
    "#             model.reset()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = {}\n",
    "keys = [\n",
    "    (type, model, dataset)\n",
    "    for type in [TOKEN, SENT]\n",
    "    for model in MODELS[type]\n",
    "    for dataset in DATASETS\n",
    "]\n",
    "for type, model, dataset in log_progress(keys):\n",
    "    path = join_path(DATA_DIR, type, DATASET, dataset + JL + GZ)\n",
    "    lines = load_gz_lines(path)\n",
    "    targets = parse_partitions(lines)\n",
    "\n",
    "    path = join_path(DATA_DIR, type, model, dataset + JL + GZ)\n",
    "    lines = load_gz_lines(path)\n",
    "    preds = parse_partitions(lines)\n",
    "\n",
    "    score = score_partitions(preds, targets)\n",
    "    scores[type, model, dataset] = score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = join_path(DATA_DIR, STATS + JL)\n",
    "lines = load_lines(path)\n",
    "items = parse_jl(lines)\n",
    "times = {\n",
    "    tuple(key): time\n",
    "    for key, time in items\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "token_labels = dict(model_labels(MODELS, TOKEN))\n",
    "token_table = report_table(scores, times, DATASETS, MODELS, TOKEN)\n",
    "html = format_report(token_table, token_labels)\n",
    "patch_readme(TOKEN, html, README)\n",
    "patch_readme(TOKEN, html, RAZDEL_README)\n",
    "HTML(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sent_labels = dict(model_labels(MODELS, SENT))\n",
    "sent_table = report_table(scores, times, DATASETS, MODELS, SENT)\n",
    "html = format_report(sent_table, sent_labels)\n",
    "patch_readme(SENT, html, README)\n",
    "patch_readme(SENT, html, RAZDEL_README)\n",
    "HTML(html)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}