slovnet
1{
2"cells": [
3{
4"cell_type": "code",
5"execution_count": null,
6"metadata": {},
7"outputs": [],
8"source": [
9"%run main.py\n",
10"%load_ext autoreload\n",
11"%autoreload 2\n",
12"# NOTE(review): RAW_DIR and S3 are not defined in this notebook; presumably main.py (run above) provides them -- confirm.\n",
13"!mkdir -p {RAW_DIR}\n",
14"s3 = S3()"
15]
16},
17{
18"cell_type": "code",
19"execution_count": null,
20"metadata": {},
21"outputs": [],
22"source": [
23"# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz -P {RAW_DIR}"
24]
25},
26{
27"cell_type": "code",
28"execution_count": null,
29"metadata": {},
30"outputs": [],
31"source": [
32"# Load the NERUS records and wrap them with log_progress (total=NERUS_TOTAL).\n",
33"records = log_progress(load_nerus(RAW_NERUS), total=NERUS_TOTAL)\n",
34"\n",
35"# Flatten records into their sentences; generator expression, nothing is consumed yet.\n",
36"sents = (sent for record in records for sent in record.sents)\n",
37"\n",
38"# Adapt each sentence's syntax markup, then read its as_json attribute (both still lazy).\n",
39"markups = (adapt_markup(sent.syntax) for sent in sents)\n",
40"items = (markup.as_json for markup in markups)\n",
41"# list() is the only step that drains the generators above.\n",
42"lines = list(format_jl(items))"
43]
44},
45{
46"cell_type": "code",
47"execution_count": null,
48"metadata": {},
49"outputs": [],
50"source": [
51"seed(1)  # fixed seed before shuffling; NOTE(review): presumably random.seed/random.shuffle brought in by main.py -- confirm\n",
52"shuffle(lines)  # NOTE(review): if this is random.shuffle it reorders lines in place, so re-running only this cell reshuffles an already-shuffled list"
53]
54},
55{
56"cell_type": "code",
57"execution_count": null,
58"metadata": {},
59"outputs": [],
60"source": [
61"# dump_gz_lines(log_progress(lines), NERUS)"
62]
63},
64{
65"cell_type": "code",
66"execution_count": null,
67"metadata": {},
68"outputs": [],
69"source": [
70"# s3.upload(NERUS, S3_NERUS)"
71]
72},
73{
74"cell_type": "code",
75"execution_count": null,
76"metadata": {},
77"outputs": [],
78"source": []
79}
80],
81"metadata": {
82"kernelspec": {
83"display_name": "Python 3",
84"language": "python",
85"name": "python3"
86},
87"language_info": {
88"codemirror_mode": {
89"name": "ipython",
90"version": 3
91},
92"file_extension": ".py",
93"mimetype": "text/x-python",
94"name": "python",
95"nbconvert_exporter": "python",
96"pygments_lexer": "ipython3",
97"version": "3.6.9"
98}
99},
100"nbformat": 4,
101"nbformat_minor": 2
102}
103