naeval
1{
2"cells": [
3{
4"cell_type": "code",
5"execution_count": null,
6"metadata": {},
7"outputs": [],
8"source": [
9"%load_ext autoreload\n",
10"%autoreload 2\n",
11"%run -n main.py"
12]
13},
14{
15"cell_type": "markdown",
16"metadata": {},
17"source": [
18"# datasets"
19]
20},
21{
22"cell_type": "code",
23"execution_count": null,
24"metadata": {},
25"outputs": [],
26"source": [
27"# path = join_path(DATA_DIR, DATASET)\n",
28"# !mkdir -p {path}"
29]
30},
31{
32"cell_type": "code",
33"execution_count": null,
34"metadata": {
35"scrolled": false
36},
37"outputs": [],
38"source": [
39"# for name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
40"# path = join_path(CORUS_DATA_DIR, CORUS_FILES[name])\n",
41"# records = LOADS[name](path)\n",
42"# records = (_.adapted for _ in records)\n",
43"# records = log_progress(records, desc=name)\n",
44"\n",
45"# path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
46"# items = as_jsons(records)\n",
47"# lines = format_jl(items)\n",
48"# dump_gz_lines(lines, path)"
49]
50},
51{
52"cell_type": "code",
53"execution_count": null,
54"metadata": {},
55"outputs": [],
56"source": [
57"%run -n main.py\n",
58"# Load every dataset via load_gz_lines -> parse_jl -> from_jsons(Markup), keyed by name.\n",
59"datasets = {}\n",
60"for name in (NE5, BSNLP, FACTRU, GAREEV):\n",
61"    path = join_path(DATA_DIR, DATASET, name + JL + GZ)\n",
62"    records = from_jsons(parse_jl(load_gz_lines(path)), Markup)\n",
63"    datasets[name] = list(records)"
64]
65},
66{
67"cell_type": "markdown",
68"metadata": {},
69"source": [
70"# models"
71]
72},
73{
74"cell_type": "code",
75"execution_count": null,
76"metadata": {},
77"outputs": [],
78"source": [
79"# for name in MODELS:\n",
80"# path = join_path(DATA_DIR, name)\n",
81"# !mkdir -p {path}"
82]
83},
84{
85"cell_type": "markdown",
86"metadata": {},
87"source": [
88"## cpu"
89]
90},
91{
92"cell_type": "code",
93"execution_count": null,
94"metadata": {},
95"outputs": [],
96"source": [
97"# docker = docker_client()"
98]
99},
100{
101"cell_type": "code",
102"execution_count": null,
103"metadata": {},
104"outputs": [],
105"source": [
106"# model_name = SPACY # MITIE, TOMITA, TEXTERRA, SLOVNET\n",
107"# model = MODELS[model_name]()\n",
108"# model.start(docker)\n",
109"# model.wait()"
110]
111},
112{
113"cell_type": "code",
114"execution_count": null,
115"metadata": {},
116"outputs": [],
117"source": [
118"# for dataset_name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
119"# records = model.map(_.text for _ in datasets[dataset_name])\n",
120"# records = (_.adapted for _ in records)\n",
121"# records = log_progress(records, desc=dataset_name)\n",
122"\n",
123"# path = join_path(DATA_DIR, model_name, dataset_name + JL + GZ)\n",
124"# items = as_jsons(records)\n",
125"# lines = format_jl(items)\n",
126"# dump_gz_lines(lines, path)"
127]
128},
129{
130"cell_type": "code",
131"execution_count": null,
132"metadata": {},
133"outputs": [],
134"source": [
135"# model.stop(docker)"
136]
137},
138{
139"cell_type": "markdown",
140"metadata": {},
141"source": [
142"## gpu"
143]
144},
145{
146"cell_type": "code",
147"execution_count": null,
148"metadata": {},
149"outputs": [],
150"source": [
151"# # select cuda 10.0, high bandwidth\n",
152"# !vast search offers | grep '1 x GTX 1080 Ti'"
153]
154},
155{
156"cell_type": "code",
157"execution_count": null,
158"metadata": {},
159"outputs": [],
160"source": [
161"# %run -n main\n",
162"# model = DeeppavlovModel()\n",
163"# model = DeeppavlovBERTModel()\n",
164"# model = SlovnetBERTModel()\n",
165"# model = StanzaModel()\n",
166"# model = DeeppavlovSlavicModel()"
167]
168},
169{
170"cell_type": "code",
171"execution_count": null,
172"metadata": {},
173"outputs": [],
174"source": [
175"# !vast create instance 577392 --image {model.image} --disk 20"
176]
177},
178{
179"cell_type": "code",
180"execution_count": null,
181"metadata": {},
182"outputs": [],
183"source": [
184"# !vast show instances"
185]
186},
187{
188"cell_type": "code",
189"execution_count": null,
190"metadata": {},
191"outputs": [],
192"source": [
193"# !ssh ssh5.vast.ai -p 19600 -l root -Nf -L {model.port}:localhost:{model.container_port}"
194]
195},
196{
197"cell_type": "code",
198"execution_count": null,
199"metadata": {
200"scrolled": false
201},
202"outputs": [],
203"source": [
204"# for dataset_name in [NE5, BSNLP, FACTRU, GAREEV]:\n",
205"# records = datasets[dataset_name]\n",
206"# records = log_progress(records, desc=dataset_name)\n",
207"# records = model.map(_.text for _ in records)\n",
208"# records = (_.adapted for _ in records)\n",
209"\n",
210"# path = join_path(DATA_DIR, model.name, dataset_name + JL + GZ)\n",
211"# items = as_jsons(records)\n",
212"# lines = format_jl(items)\n",
213"# dump_gz_lines(lines, path)"
214]
215},
216{
217"cell_type": "code",
218"execution_count": null,
219"metadata": {},
220"outputs": [],
221"source": [
222"# !vast destroy instance 585391"
223]
224},
225{
226"cell_type": "markdown",
227"metadata": {},
228"source": [
229"# score"
230]
231},
232{
233"cell_type": "code",
234"execution_count": null,
235"metadata": {},
236"outputs": [],
237"source": [
238"# Markups stored under DATA_DIR/<model>/<dataset>, keyed by (dataset, model).\n",
239"dataset_models = {}\n",
240"for dataset in DATASETS:\n",
241"    for model in MODELS:\n",
242"        path = join_path(DATA_DIR, model, dataset + JL + GZ)\n",
243"        records = from_jsons(parse_jl(load_gz_lines(path)), Markup)\n",
244"        dataset_models[dataset, model] = list(records)"
245]
246},
247{
248"cell_type": "code",
249"execution_count": null,
250"metadata": {},
251"outputs": [],
252"source": [
253"# Score each (dataset, model) entry against the markups loaded for its dataset.\n",
254"scores = {}\n",
255"for key in log_progress(dataset_models):\n",
256"    dataset, model = key\n",
257"    scores[key] = score_markups(dataset_models[key], datasets[dataset])"
258]
259},
260{
261"cell_type": "markdown",
262"metadata": {},
263"source": [
264"# report"
265]
266},
267{
268"cell_type": "code",
269"execution_count": null,
270"metadata": {},
271"outputs": [],
272"source": [
273"# Build the scores table (reused by the README cell) and display it as HTML.\n",
274"scores_table = scores_report_table(scores, DATASETS, MODELS)\n",
275"HTML(format_scores_report(scores_table))"
276]
277},
278{
279"cell_type": "code",
280"execution_count": null,
281"metadata": {},
282"outputs": [],
283"source": [
284"html = format_github_scores_report(scores_table)\n",
285"for readme in [README, SLOVNET_README]:\n",
286"    patch_readme(NER1, html, readme)\n",
287"HTML(html)"
288]
289},
290{
291"cell_type": "code",
292"execution_count": null,
293"metadata": {},
294"outputs": [],
295"source": [
296"BENCH = [ # one Bench(...) record per model; device is passed only on some entries\n",
297" # GTX 1080 Ti; NOTE(review): SLOVNET in this group passes no device=GPU, confirm grouping\n",
298" Bench(\n",
299" DEEPPAVLOV,\n",
300" init=5.9,\n",
301" disk=1 * GB, # 1GB emb + 5MB model\n",
302" ram=3 * GB,\n",
303" speed=24.31, # 1.95 / 7 cores on CPU\n",
304" device=GPU\n",
305" ),\n",
306" Bench(\n",
307" DEEPPAVLOV_BERT,\n",
308" init=34.5,\n",
309" disk=2 * GB,\n",
310" ram=6 * GB,\n",
311" speed=13.13, # 17.71 / 3 cores on CPU\n",
312" device=GPU\n",
313" ),\n",
314" Bench(\n",
315" DEEPPAVLOV_SLAVIC,\n",
316" init=35,\n",
317" disk=2 * GB,\n",
318" ram=4 * GB,\n",
319" speed=8,\n",
320" device=GPU\n",
321" ),\n",
322" Bench(\n",
323" SLOVNET_BERT,\n",
324" init=5,\n",
325" disk=473 * MB,\n",
326" ram=9500 * MB,\n",
327" speed=40,\n",
328" device=GPU\n",
329" ),\n",
330" Bench(\n",
331" SLOVNET,\n",
332" init=1,\n",
333" disk=27 * MB,\n",
334" ram=205 * MB,\n",
335" speed=25.3,\n",
336" ),\n",
337" \n",
338" # 16 CPUs; NOTE(review): STANZA in this group passes device=GPU, confirm grouping\n",
339" Bench(\n",
340" PULLENTI,\n",
341" init=2.85,\n",
342" disk=16 * MB,\n",
343" ram=253 * MB,\n",
344" speed=6.05\n",
345" ),\n",
346" Bench(\n",
347" TEXTERRA,\n",
348" init=47.6,\n",
349" disk=193 * MB,\n",
350" ram=3.3 * GB, # leaks\n",
351" speed=20.16 / 5 # utilizes ~5 cores, so the measured speed is divided by 5\n",
352" ),\n",
353" Bench(\n",
354" TOMITA,\n",
355" init=2.03,\n",
356" disk=64 * MB,\n",
357" ram=63 * MB,\n",
358" speed=29.8,\n",
359" ),\n",
360" Bench(\n",
361" MITIE,\n",
362" init=28.3,\n",
363" disk=327 * MB,\n",
364" ram=261 * MB,\n",
365" speed=32.8,\n",
366" ), \n",
367" Bench(\n",
368" SPACY,\n",
369" init=8,\n",
370" disk=140 * MB,\n",
371" ram=625 * MB,\n",
372" speed=8,\n",
373" ),\n",
374" Bench(\n",
375" STANZA,\n",
376" init=3,\n",
377" disk=591 * MB,\n",
378" ram=11 * GB,\n",
379" speed=3,\n",
380" device=GPU\n",
381" ),\n",
382"]\n",
383"\n",
384"# Format the benchmark table once, hand it to patch_readme for both READMEs, then display it.\n",
385"html = format_bench_report(bench_report_table(BENCH, MODELS))\n",
386"for readme in [README, SLOVNET_README]:\n",
387"    patch_readme(NER2, html, readme)\n",
388"HTML(html)"
389]
390},
391{
392"cell_type": "code",
393"execution_count": null,
394"metadata": {},
395"outputs": [],
396"source": []
397}
398],
399"metadata": {
400"kernelspec": {
401"display_name": "Python 3",
402"language": "python",
403"name": "python3"
404},
405"language_info": {
406"codemirror_mode": {
407"name": "ipython",
408"version": 3
409},
410"file_extension": ".py",
411"mimetype": "text/x-python",
412"name": "python",
413"nbconvert_exporter": "python",
414"pygments_lexer": "ipython3",
415"version": "3.6.9"
416}
417},
418"nbformat": 4,
419"nbformat_minor": 2
420}
421