naeval

main.ipynb
420 строк · 9.6 Кб
Перенос по словам
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": null,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "%load_ext autoreload\n",
10
    "%autoreload 2\n",
11
    "%run -n main.py"
12
   ]
13
  },
14
  {
15
   "cell_type": "markdown",
16
   "metadata": {},
17
   "source": [
18
    "# datasets"
19
   ]
20
  },
21
  {
22
   "cell_type": "code",
23
   "execution_count": null,
24
   "metadata": {},
25
   "outputs": [],
26
   "source": [
27
    "# path = join_path(DATA_DIR, DATASET)\n",
28
    "# !mkdir -p {path}"
29
   ]
30
  },
31
  {
32
   "cell_type": "code",
33
   "execution_count": null,
34
   "metadata": {
35
    "scrolled": false
36
   },
37
   "outputs": [],
38
   "source": [
39
    "# for name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
40
    "#     path = join_path(CORUS_DATA_DIR, CORUS_FILES[name])\n",
41
    "#     records = LOADS[name](path)\n",
42
    "#     records = (_.adapted for _ in records)\n",
43
    "#     records = log_progress(records, desc=name)\n",
44
    "\n",
45
    "#     path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
46
    "#     items = as_jsons(records)\n",
47
    "#     lines = format_jl(items)\n",
48
    "#     dump_gz_lines(lines, path)"
49
   ]
50
  },
51
  {
52
   "cell_type": "code",
53
   "execution_count": null,
54
   "metadata": {},
55
   "outputs": [],
56
   "source": [
57
    "%run -n main.py\n",
58
    "datasets = {}\n",
59
    "for name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
60
    "    path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
61
    "    lines = load_gz_lines(path)\n",
62
    "    items = parse_jl(lines)\n",
63
    "    datasets[name] = list(from_jsons(items, Markup))"
64
   ]
65
  },
66
  {
67
   "cell_type": "markdown",
68
   "metadata": {},
69
   "source": [
70
    "# models"
71
   ]
72
  },
73
  {
74
   "cell_type": "code",
75
   "execution_count": null,
76
   "metadata": {},
77
   "outputs": [],
78
   "source": [
79
    "# for name in MODELS:\n",
80
    "#     path = join_path(DATA_DIR, name)\n",
81
    "#     !mkdir -p {path}"
82
   ]
83
  },
84
  {
85
   "cell_type": "markdown",
86
   "metadata": {},
87
   "source": [
88
    "## cpu"
89
   ]
90
  },
91
  {
92
   "cell_type": "code",
93
   "execution_count": null,
94
   "metadata": {},
95
   "outputs": [],
96
   "source": [
97
    "# docker = docker_client()"
98
   ]
99
  },
100
  {
101
   "cell_type": "code",
102
   "execution_count": null,
103
   "metadata": {},
104
   "outputs": [],
105
   "source": [
106
    "# model_name = SPACY  # MITIE, TOMITA, TEXTERRA, SLOVNET\n",
107
    "# model = MODELS[model_name]()\n",
108
    "# model.start(docker)\n",
109
    "# model.wait()"
110
   ]
111
  },
112
  {
113
   "cell_type": "code",
114
   "execution_count": null,
115
   "metadata": {},
116
   "outputs": [],
117
   "source": [
118
    "# for dataset_name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
119
    "#     records = model.map(_.text for _ in datasets[dataset_name])\n",
120
    "#     records = (_.adapted for _ in records)\n",
121
    "#     records = log_progress(records, desc=dataset_name)\n",
122
    "\n",
123
    "#     path = join_path(DATA_DIR, model_name, dataset_name + JL + GZ)\n",
124
    "#     items = as_jsons(records)\n",
125
    "#     lines = format_jl(items)\n",
126
    "#     dump_gz_lines(lines, path)"
127
   ]
128
  },
129
  {
130
   "cell_type": "code",
131
   "execution_count": null,
132
   "metadata": {},
133
   "outputs": [],
134
   "source": [
135
    "# model.stop(docker)"
136
   ]
137
  },
138
  {
139
   "cell_type": "markdown",
140
   "metadata": {},
141
   "source": [
142
    "## gpu"
143
   ]
144
  },
145
  {
146
   "cell_type": "code",
147
   "execution_count": null,
148
   "metadata": {},
149
   "outputs": [],
150
   "source": [
151
    "# # select cuda 10.0, high bandwidth\n",
152
    "# !vast search offers | grep '1 x  GTX 1080 Ti'"
153
   ]
154
  },
155
  {
156
   "cell_type": "code",
157
   "execution_count": null,
158
   "metadata": {},
159
   "outputs": [],
160
   "source": [
161
    "# %run -n main\n",
162
    "# model = DeeppavlovModel()\n",
163
    "# model = DeeppavlovBERTModel()\n",
164
    "# model = SlovnetBERTModel()\n",
165
    "# model = StanzaModel()\n",
166
    "# model = DeeppavlovSlavicModel()"
167
   ]
168
  },
169
  {
170
   "cell_type": "code",
171
   "execution_count": null,
172
   "metadata": {},
173
   "outputs": [],
174
   "source": [
175
    "# !vast create instance 577392 --image {model.image} --disk 20"
176
   ]
177
  },
178
  {
179
   "cell_type": "code",
180
   "execution_count": null,
181
   "metadata": {},
182
   "outputs": [],
183
   "source": [
184
    "# !vast show instances"
185
   ]
186
  },
187
  {
188
   "cell_type": "code",
189
   "execution_count": null,
190
   "metadata": {},
191
   "outputs": [],
192
   "source": [
193
    "# !ssh ssh5.vast.ai -p 19600 -l root -Nf -L {model.port}:localhost:{model.container_port}"
194
   ]
195
  },
196
  {
197
   "cell_type": "code",
198
   "execution_count": null,
199
   "metadata": {
200
    "scrolled": false
201
   },
202
   "outputs": [],
203
   "source": [
204
    "# for dataset_name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
205
    "#     records = datasets[dataset_name]\n",
206
    "#     records = log_progress(records, desc=dataset_name)\n",
207
    "#     records = model.map(_.text for _ in records)\n",
208
    "#     records = (_.adapted for _ in records)\n",
209
    "\n",
210
    "#     path = join_path(DATA_DIR, model.name, dataset_name + JL + GZ)\n",
211
    "#     items = as_jsons(records)\n",
212
    "#     lines = format_jl(items)\n",
213
    "#     dump_gz_lines(lines, path)"
214
   ]
215
  },
216
  {
217
   "cell_type": "code",
218
   "execution_count": null,
219
   "metadata": {},
220
   "outputs": [],
221
   "source": [
222
    "# !vast destroy instance 585391"
223
   ]
224
  },
225
  {
226
   "cell_type": "markdown",
227
   "metadata": {},
228
   "source": [
229
    "# score"
230
   ]
231
  },
232
  {
233
   "cell_type": "code",
234
   "execution_count": null,
235
   "metadata": {},
236
   "outputs": [],
237
   "source": [
238
    "dataset_models = {}\n",
239
    "for dataset in DATASETS:\n",
240
    "    for model in MODELS:\n",
241
    "        path = join_path(DATA_DIR, model, dataset + JL + GZ)\n",
242
    "        lines = load_gz_lines(path)\n",
243
    "        items = parse_jl(lines)\n",
244
    "        dataset_models[dataset, model] = list(from_jsons(items, Markup))"
245
   ]
246
  },
247
  {
248
   "cell_type": "code",
249
   "execution_count": null,
250
   "metadata": {},
251
   "outputs": [],
252
   "source": [
253
    "scores = {}\n",
254
    "for dataset, model in log_progress(dataset_models):\n",
255
    "    preds = dataset_models[dataset, model]\n",
256
    "    targets = datasets[dataset]\n",
257
    "    scores[dataset, model] = score_markups(preds, targets)"
258
   ]
259
  },
260
  {
261
   "cell_type": "markdown",
262
   "metadata": {},
263
   "source": [
264
    "# report"
265
   ]
266
  },
267
  {
268
   "cell_type": "code",
269
   "execution_count": null,
270
   "metadata": {},
271
   "outputs": [],
272
   "source": [
273
    "scores_table = scores_report_table(scores, DATASETS, MODELS)\n",
274
    "html = format_scores_report(scores_table)\n",
275
    "HTML(html)"
276
   ]
277
  },
278
  {
279
   "cell_type": "code",
280
   "execution_count": null,
281
   "metadata": {},
282
   "outputs": [],
283
   "source": [
284
    "html = format_github_scores_report(scores_table)\n",
285
    "patch_readme(NER1, html, README)\n",
286
    "patch_readme(NER1, html, SLOVNET_README)\n",
287
    "HTML(html)"
288
   ]
289
  },
290
  {
291
   "cell_type": "code",
292
   "execution_count": null,
293
   "metadata": {},
294
   "outputs": [],
295
   "source": [
296
    "BENCH = [\n",
297
    "    # GTX 1080 Ti\n",
298
    "    Bench(\n",
299
    "        DEEPPAVLOV,\n",
300
    "        init=5.9,\n",
301
    "        disk=1 * GB,  # 1GB emb + 5MB model\n",
302
    "        ram=3 * GB,\n",
303
    "        speed=24.31,  # 1.95 / 7 cores on CPU\n",
304
    "        device=GPU\n",
305
    "    ),\n",
306
    "    Bench(\n",
307
    "        DEEPPAVLOV_BERT,\n",
308
    "        init=34.5,\n",
309
    "        disk=2 * GB,\n",
310
    "        ram=6 * GB,\n",
311
    "        speed=13.13,  # 17.71 / 3 cores on CPU\n",
312
    "        device=GPU\n",
313
    "    ),\n",
314
    "    Bench(\n",
315
    "        DEEPPAVLOV_SLAVIC,\n",
316
    "        init=35,\n",
317
    "        disk=2 * GB,\n",
318
    "        ram=4 * GB,\n",
319
    "        speed=8,\n",
320
    "        device=GPU\n",
321
    "    ),\n",
322
    "    Bench(\n",
323
    "        SLOVNET_BERT,\n",
324
    "        init=5,\n",
325
    "        disk=473 * MB,\n",
326
    "        ram=9500 * MB,\n",
327
    "        speed=40,\n",
328
    "        device=GPU\n",
329
    "    ),\n",
330
    "    Bench(\n",
331
    "        SLOVNET,\n",
332
    "        init=1,\n",
333
    "        disk=27 * MB,\n",
334
    "        ram=205 * MB,\n",
335
    "        speed=25.3,\n",
336
    "    ),\n",
337
    "    \n",
338
    "    # 16 CPUs\n",
339
    "    Bench(\n",
340
    "        PULLENTI,\n",
341
    "        init=2.85,\n",
342
    "        disk=16 * MB,\n",
343
    "        ram=253 * MB,\n",
344
    "        speed=6.05\n",
345
    "    ),\n",
346
    "    Bench(\n",
347
    "        TEXTERRA,\n",
348
    "        init=47.6,\n",
349
    "        disk=193 * MB,\n",
350
    "        ram=3.3 * GB,  # leaks\n",
351
    "        speed=20.16 / 5  # utilizes ~5 cores\n",
352
    "    ),\n",
353
    "    Bench(\n",
354
    "        TOMITA,\n",
355
    "        init=2.03,\n",
356
    "        disk=64 * MB,\n",
357
    "        ram=63 * MB,\n",
358
    "        speed=29.8,\n",
359
    "    ),\n",
360
    "    Bench(\n",
361
    "        MITIE,\n",
362
    "        init=28.3,\n",
363
    "        disk=327 * MB,\n",
364
    "        ram=261 * MB,\n",
365
    "        speed=32.8,\n",
366
    "    ), \n",
367
    "    Bench(\n",
368
    "        SPACY,\n",
369
    "        init=8,\n",
370
    "        disk=140 * MB,\n",
371
    "        ram=625 * MB,\n",
372
    "        speed=8,\n",
373
    "    ),\n",
374
    "    Bench(\n",
375
    "        STANZA,\n",
376
    "        init=3,\n",
377
    "        disk=591 * MB,\n",
378
    "        ram=11 * GB,\n",
379
    "        speed=3,\n",
380
    "        device=GPU\n",
381
    "    ),\n",
382
    "]\n",
383
    "\n",
384
    "bench_table = bench_report_table(BENCH, MODELS)\n",
385
    "html = format_bench_report(bench_table)\n",
386
    "patch_readme(NER2, html, README)\n",
387
    "patch_readme(NER2, html, SLOVNET_README)\n",
388
    "HTML(html)"
389
   ]
390
  },
391
  {
392
   "cell_type": "code",
393
   "execution_count": null,
394
   "metadata": {},
395
   "outputs": [],
396
   "source": []
397
  }
398
 ],
399
 "metadata": {
400
  "kernelspec": {
401
   "display_name": "Python 3",
402
   "language": "python",
403
   "name": "python3"
404
  },
405
  "language_info": {
406
   "codemirror_mode": {
407
    "name": "ipython",
408
    "version": 3
409
   },
410
   "file_extension": ".py",
411
   "mimetype": "text/x-python",
412
   "name": "python",
413
   "nbconvert_exporter": "python",
414
   "pygments_lexer": "ipython3",
415
   "version": "3.6.9"
416
  }
417
 },
418
 "nbformat": 4,
419
 "nbformat_minor": 2
420
}
421
naeval

Использование cookies