{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "%run -n main.py"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %run -n main.py\n",
    "# for name in [SENT, TOKEN]:\n",
    "#     path = join_path(DATA_DIR, name, DATASET)\n",
    "#     !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for type in [TOKEN, SENT]:\n",
    "#     log(type)\n",
    "#     for name in [CORPORA, SYNTAG, GICRYA, RNC]:\n",
    "#         paths = (\n",
    "#             join_path(CORUS_DATA_DIR, _)\n",
    "#             for _ in CORUS_FILES[name]\n",
    "#         )\n",
    "#         records = (\n",
    "#             record\n",
    "#             for path in paths\n",
    "#             for record in DATASETS[name](path)\n",
    "#         )\n",
    "#         records = log_progress(records, desc=name)\n",
    "#         partitions = PARSES[type][name](records)\n",
    "#         lines = format_partitions(partitions)\n",
    "#         lines = sample(lines)\n",
    "#         path = join_path(DATA_DIR, type, DATASET, name + JL + GZ)\n",
    "#         dump_gz_lines(lines, path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for type in [TOKEN, SENT]:\n",
    "#     for name in MODELS[type]:\n",
    "#         path = join_path(DATA_DIR, type, name)\n",
    "#         !mkdir -p {path}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install rusenttokenize\n",
    "# !pip install segtok\n",
    "# !pip install mosestokenizer\n",
    "# !pip install razdel\n",
    "\n",
    "# !pip install spacy\n",
    "# !pip install https://github.com/aatimofeev/spacy_russian_tokenizer/archive/master.zip#egg=python-simhash\n",
    "# !pip install https://github.com/Koziev/rutokenizer/archive/master.zip#egg=rutokenizer\n",
    "# !pip install pymystem3\n",
    "\n",
    "# !pip install nltk\n",
    "# nltk.download('punkt')\n",
    "# !wget https://raw.githubusercontent.com/mhq/train_punkt/master/russian.pickle -O ~/nltk_data/tokenizers/punkt/PY3/russian.pickle\n",
    "\n",
    "# Texterra\n",
    "# Could also compare against https://texterra.ispras.ru/products, but\n",
    "# 1. it is slow, at the very least due to HTTP overhead\n",
    "# 2. it sometimes raises an error (possibly on English sentences)\n",
    "# 3. its quality is only slightly above segtok\n",
    "\n",
    "# Polyglot\n",
    "# implements http://www.unicode.org/reports/tr29/\n",
    "\n",
    "# Could not get it installed — serious trouble with ICU\n",
    "# brew install icu4c\n",
    "# export ICU_VERSION=62.1\n",
    "# export BASE=/usr/local/Cellar/icu4c/\n",
    "# export PATH=$PATH:$BASE/$ICU_VERSION/bin\n",
    "# export PYICU_INCLUDES=$BASE/$ICU_VERSION/include\n",
    "# export PYICU_LFLAGS=-L$BASE/$ICU_VERSION/lib\n",
    "# pip install pyicu polyglot\n",
    "\n",
    "# It seemed to install, but\n",
    "# > from polyglot.text import Text\n",
    "# > Text('...')\n",
    "# Symbol not found: __ZNK6icu_6214Transliterator12getTargetSetERNS_10UnicodeSetE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# for type in [SENT, TOKEN]:\n",
    "#     log(type)\n",
    "#     for model_name in MODELS[type]:\n",
    "#         log(model_name)\n",
    "#         model = MODELS[type][model_name]\n",
    "#         if is_class(model):\n",
    "#             model = model()\n",
    "#         model = Timing(model)\n",
    "\n",
    "#         for dataset_name in DATASETS:\n",
    "#             path = join_path(DATA_DIR, type, DATASET, dataset_name + JL + GZ)\n",
    "#             lines = load_gz_lines(path)\n",
    "#             records = parse_partitions(lines)\n",
    "#             records = log_progress(records, desc=dataset_name)\n",
    "\n",
    "#             records = (\n",
    "#                 Partition.from_substrings(model(_.text))\n",
    "#                 for _ in records\n",
    "#             )\n",
    "#             path = join_path(DATA_DIR, type, model_name, dataset_name + JL + GZ)\n",
    "#             lines = format_partitions(records)\n",
    "#             dump_gz_lines(lines, path)\n",
    "\n",
    "#             path = join_path(DATA_DIR, STATS + JL)\n",
    "#             record = [[type, model_name, dataset_name], model.time]\n",
    "#             lines = format_jl([record])\n",
    "#             append_lines(lines, path)\n",
    "\n",
    "#             model.reset()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "scores = {}\n",
    "keys = [\n",
    "    (type, model, dataset)\n",
    "    for type in [TOKEN, SENT]\n",
    "    for model in MODELS[type]\n",
    "    for dataset in DATASETS\n",
    "]\n",
    "for type, model, dataset in log_progress(keys):\n",
    "    path = join_path(DATA_DIR, type, DATASET, dataset + JL + GZ)\n",
    "    lines = load_gz_lines(path)\n",
    "    targets = parse_partitions(lines)\n",
    "\n",
    "    path = join_path(DATA_DIR, type, model, dataset + JL + GZ)\n",
    "    lines = load_gz_lines(path)\n",
    "    preds = parse_partitions(lines)\n",
    "\n",
    "    score = score_partitions(preds, targets)\n",
    "    scores[type, model, dataset] = score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = join_path(DATA_DIR, STATS + JL)\n",
    "lines = load_lines(path)\n",
    "items = parse_jl(lines)\n",
    "times = {\n",
    "    tuple(key): time\n",
    "    for key, time in items\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "token_labels = dict(model_labels(MODELS, TOKEN))\n",
    "token_table = report_table(scores, times, DATASETS, MODELS, TOKEN)\n",
    "html = format_report(token_table, token_labels)\n",
    "patch_readme(TOKEN, html, README)\n",
    "patch_readme(TOKEN, html, RAZDEL_README)\n",
    "HTML(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sent_labels = dict(model_labels(MODELS, SENT))\n",
    "sent_table = report_table(scores, times, DATASETS, MODELS, SENT)\n",
    "html = format_report(sent_table, sent_labels)\n",
    "patch_readme(SENT, html, README)\n",
    "patch_readme(SENT, html, RAZDEL_README)\n",
    "HTML(html)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}