1{
2"cells": [
3{
4"cell_type": "code",
5"execution_count": null,
6"metadata": {},
7"outputs": [],
8"source": [
9"%load_ext autoreload\n",
10"%autoreload 2\n",
11"%run -n main.py"
12]
13},
14{
15"cell_type": "markdown",
16"metadata": {},
17"source": [
18"# datasets"
19]
20},
21{
22"cell_type": "code",
23"execution_count": null,
24"metadata": {},
25"outputs": [],
26"source": [
27"# path = join_path(DATA_DIR, DATASET)\n",
28"# !mkdir -p {path}"
29]
30},
31{
32"cell_type": "code",
33"execution_count": null,
34"metadata": {},
35"outputs": [],
36"source": [
37"# for name in DATASETS:\n",
38"# paths = (\n",
39"# join_path(CORUS_DATA_DIR, _)\n",
40"# for _ in CORUS_FILES[name]\n",
41"# )\n",
42"# records = (\n",
43"# record\n",
44"# for path in paths\n",
45"# for record in load_dataset(path)\n",
46"# )\n",
47"# records = log_progress(records, desc=name)\n",
48"# records = sample(records, 1000)\n",
49"\n",
50"# path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
51"# items = as_jsons(records)\n",
52"# lines = format_jl(items)\n",
53"# dump_gz_lines(lines, path)"
54]
55},
56{
57"cell_type": "code",
58"execution_count": null,
59"metadata": {},
60"outputs": [],
61"source": [
"datasets = {}\n",
"for name in DATASETS:\n",
"    source = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
"    records = from_jsons(parse_jl(load_gz_lines(source)), Markup)\n",
"    datasets[name] = list(records)"
68]
69},
70{
71"cell_type": "markdown",
72"metadata": {},
73"source": [
74"# models"
75]
76},
77{
78"cell_type": "code",
79"execution_count": null,
80"metadata": {},
81"outputs": [],
82"source": [
83"# for name in MODELS:\n",
84"# path = join_path(DATA_DIR, name)\n",
85"# !mkdir -p {path}"
86]
87},
88{
89"cell_type": "markdown",
90"metadata": {},
91"source": [
92"## cpu"
93]
94},
95{
96"cell_type": "code",
97"execution_count": null,
98"metadata": {},
99"outputs": [],
100"source": [
101"# docker = docker_client()"
102]
103},
104{
105"cell_type": "code",
106"execution_count": null,
107"metadata": {},
108"outputs": [],
109"source": [
110"# model_name = SPACY\n",
111"# model = MODELS[model_name]()\n",
112"# model.start(docker)\n",
113"# model.wait()"
114]
115},
116{
117"cell_type": "code",
118"execution_count": null,
119"metadata": {},
120"outputs": [],
121"source": [
122"# for dataset_name in DATASETS:\n",
123"# records = model.map(_.words for _ in datasets[dataset_name])\n",
124"# records = log_progress(records, desc=dataset_name)\n",
125"\n",
126"# path = join_path(DATA_DIR, model_name, dataset_name + JL + GZ)\n",
127"# items = as_jsons(records)\n",
128"# lines = format_jl(items)\n",
129"# dump_gz_lines(lines, path)"
130]
131},
132{
133"cell_type": "code",
134"execution_count": null,
135"metadata": {},
136"outputs": [],
137"source": [
138"# model.stop(docker)"
139]
140},
141{
142"cell_type": "markdown",
143"metadata": {},
144"source": [
145"## gpu"
146]
147},
148{
149"cell_type": "code",
150"execution_count": null,
151"metadata": {},
152"outputs": [],
153"source": [
154"# !vast search offers | grep '1 x GTX 1080 Ti'"
155]
156},
157{
158"cell_type": "code",
159"execution_count": null,
160"metadata": {},
161"outputs": [],
162"source": [
163"# model = DeeppavlovModel()\n",
164"# model = DeeppavlovBERTModel()\n",
165"# model = SlovnetBERTModel()\n",
166"model = StanzaModel()"
167]
168},
169{
170"cell_type": "code",
171"execution_count": null,
172"metadata": {},
173"outputs": [],
174"source": [
175"# !vast create instance 498741 --image {model.image} --disk 30"
176]
177},
178{
179"cell_type": "code",
180"execution_count": null,
181"metadata": {},
182"outputs": [],
183"source": [
184"# !vast show instances"
185]
186},
187{
188"cell_type": "code",
189"execution_count": null,
190"metadata": {},
191"outputs": [],
192"source": [
193"# !ssh ssh4.vast.ai -p 20861 -l root -Nf -L {model.port}:localhost:{model.container_port}"
194]
195},
196{
197"cell_type": "code",
198"execution_count": null,
199"metadata": {},
200"outputs": [],
201"source": [
202"# for dataset_name in DATASETS:\n",
203"# records = datasets[dataset_name]\n",
204"# records = log_progress(records, desc=dataset_name)\n",
205"# records = model.map(_.words for _ in records)\n",
206"\n",
207"# path = join_path(DATA_DIR, model.name, dataset_name + JL + GZ)\n",
208"# items = as_jsons(records)\n",
209"# lines = format_jl(items)\n",
210"# dump_gz_lines(lines, path)"
211]
212},
213{
214"cell_type": "code",
215"execution_count": null,
216"metadata": {},
217"outputs": [],
218"source": [
219"# !vast destroy instance 500861"
220]
221},
222{
223"cell_type": "markdown",
224"metadata": {},
225"source": [
226"# score"
227]
228},
229{
230"cell_type": "code",
231"execution_count": null,
232"metadata": {},
233"outputs": [],
234"source": [
"dataset_models = {}\n",
"for dataset in DATASETS:\n",
"    for model in MODELS:\n",
"        source = join_path(DATA_DIR, model, dataset + JL + GZ)\n",
"        markups = from_jsons(parse_jl(load_gz_lines(source)), Markup)\n",
"        dataset_models[dataset, model] = list(markups)"
242]
243},
244{
245"cell_type": "code",
246"execution_count": null,
247"metadata": {},
248"outputs": [],
249"source": [
"scores = {}\n",
"for key in log_progress(dataset_models):\n",
"    dataset, model = key\n",
"    scores[key] = score_markups(dataset_models[key], datasets[dataset])"
255]
256},
257{
258"cell_type": "markdown",
259"metadata": {},
260"source": [
261"# report"
262]
263},
264{
265"cell_type": "code",
266"execution_count": null,
267"metadata": {},
268"outputs": [],
269"source": [
"table = scores_report_table(scores, DATASETS, MODELS)\n",
"html = format_scores_report(table)\n",
"for readme in [README, SLOVNET_README]:\n",
"    patch_readme(MORPH1, html, readme)\n",
"HTML(html)"
275]
276},
277{
278"cell_type": "code",
279"execution_count": null,
280"metadata": {},
281"outputs": [],
282"source": [
"BENCH = [\n",
"    Bench(DEEPPAVLOV, init=4, disk=32 * MB, ram=10 * GB, speed=90, device=GPU),\n",
"    # disk = BERT + model\n",
"    Bench(DEEPPAVLOV_BERT, init=20, disk=(706 + 687) * MB, ram=8.5 * GB, speed=85, device=GPU),\n",
"    Bench(SLOVNET_BERT, init=5, disk=475 * MB, ram=8087 * MB, speed=285, device=GPU),\n",
"    Bench(SLOVNET, init=1, disk=27 * MB, ram=115 * MB, speed=532),\n",
"\n",
"    Bench(UDPIPE, init=6.91, disk=45 * MB, ram=242 * MB, speed=56.2),\n",
"    Bench(SPACY, init=8, disk=140 * MB, ram=579 * MB, speed=50),\n",
"    Bench(MARU, init=15.8, disk=44 * MB, ram=370 * MB, speed=36.4),\n",
"    Bench(RNNMORPH, init=8.73, disk=10 * MB, ram=289 * MB, speed=16.6),\n",
"    Bench(RUPOSTAGGER, init=4.81, disk=2.7 * MB, ram=118 * MB, speed=48),\n",
"    Bench(STANZA, init=2, disk=591 * MB, ram=393 * MB, speed=92),\n",
"]\n",
"\n",
"table = bench_report_table(BENCH, MODELS)\n",
"html = format_bench_report(table)\n",
"for readme in [README, SLOVNET_README]:\n",
"    patch_readme(MORPH2, html, readme)\n",
"HTML(html)"
365]
366},
367{
368"cell_type": "code",
369"execution_count": null,
370"metadata": {},
371"outputs": [],
372"source": []
373}
374],
375"metadata": {
376"kernelspec": {
377"display_name": "Python 3",
378"language": "python",
379"name": "python3"
380},
381"language_info": {
382"codemirror_mode": {
383"name": "ipython",
384"version": 3
385},
386"file_extension": ".py",
387"mimetype": "text/x-python",
388"name": "python",
389"nbconvert_exporter": "python",
390"pygments_lexer": "ipython3",
391"version": "3.6.9"
392}
393},
394"nbformat": 4,
395"nbformat_minor": 2
396}
397