slovnet
111 строк · 2.9 Кб
1{
2"cells": [
3{
4"cell_type": "code",
5"execution_count": null,
6"metadata": {},
7"outputs": [],
8"source": [
9"%run main.py\n",
10"%load_ext autoreload\n",
11"%autoreload 2\n",
12"# Make sure the download directory exists before anything is fetched into it.\n",
13"!mkdir -p {RAW_DIR}\n",
14"s3 = S3()  # presumably S3 and RAW_DIR are defined by main.py (run above) - TODO confirm"
15]
16},
17{
18"cell_type": "code",
19"execution_count": null,
20"metadata": {},
21"outputs": [],
22"source": [
23"!wget -nc https://storage.yandexcloud.net/natasha-corus/taiga/Fontanka.tar.gz -P {RAW_DIR}  # -nc: keep existing downloads; a re-run would otherwise save name.1 copies\n",
24"!wget -nc https://storage.yandexcloud.net/natasha-corus/ods/gazeta_v1.csv.zip -P {RAW_DIR}\n",
25"!wget -nc https://storage.yandexcloud.net/natasha-corus/ods/interfax_v1.csv.zip -P {RAW_DIR}\n",
26"!wget -nc https://storage.yandexcloud.net/natasha-corus/lenta-ru-news.csv.gz -P {RAW_DIR}\n",
27"!wget -nc https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2014.tar.bz2 -P {RAW_DIR}\n",
28"!wget -nc https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2015-part1.tar.bz2 -P {RAW_DIR}\n",
29"!wget -nc https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2015-part2.tar.bz2 -P {RAW_DIR}"
30]
31},
32{
33"cell_type": "code",
34"execution_count": null,
35"metadata": {},
36"outputs": [],
37"source": [
38"LOADS = {  # archive file name -> loader; each loader is called with the file path and yields records with a .text attribute\n",
39"    'gazeta_v1.csv.zip': load_ods_gazeta,\n",
40"    'interfax_v1.csv.zip': load_ods_interfax,\n",
41"    'Fontanka.tar.gz': load_taiga_fontanka,\n",
42"    'lenta-ru-news.csv.gz': load_lenta,\n",
43"    'news-articles-2015-part1.tar.bz2': load_buriy_news,\n",
44"    'news-articles-2015-part2.tar.bz2': load_buriy_news,\n",
45"    'news-articles-2014.tar.bz2': load_buriy_news,\n",
46"}\n",
47"\n",
48"# listdir order is arbitrary; sort it so the seeded shuffle later sees the same input order on every run\n",
49"lines = []  # Requires 15Gb RAM\n",
50"for name in sorted(listdir(RAW_DIR)):\n",
51"    path = join_path(RAW_DIR, name)\n",
52"    records = LOADS[name](path)  # KeyError if RAW_DIR holds a file that is not listed in LOADS\n",
53"    for record in log_progress(records, desc=name):\n",
54"        line = re.sub(r'\\s+', ' ', record.text)  # news article -> single line\n",
55"        lines.append(line)"
56]
57},
58{
59"cell_type": "code",
60"execution_count": null,
61"metadata": {},
62"outputs": [],
63"source": [
64"seed(1)  # fixed seed so the shuffle below is repeatable on a fresh run (assumes random.seed - TODO confirm)\n",
65"shuffle(lines)  # presumably random.shuffle, reordering lines in place; re-running this cell alone reshuffles the already shuffled list"
66]
67},
68{
69"cell_type": "code",
70"execution_count": null,
71"metadata": {},
72"outputs": [],
73"source": [
74"cap = 1000  # number of shuffled lines held out for TEST\n",
75"dump_lines(lines[:cap], TEST)  # first cap lines go to TEST\n",
76"dump_lines(log_progress(lines[cap:]), TRAIN)  # every remaining line goes to TRAIN, wrapped in log_progress"
77]
78},
79{
80"cell_type": "code",
81"execution_count": null,
82"metadata": {},
83"outputs": [],
84"source": [
85"s3.upload(TEST, S3_TEST)  # presumably copies the local TEST file to the S3_TEST location - verify against S3.upload\n",
86"s3.upload(TRAIN, S3_TRAIN)  # same call for the TRAIN file"
87]
88}
89],
90"metadata": {
91"kernelspec": {
92"display_name": "Python 3",
93"language": "python",
94"name": "python3"
95},
96"language_info": {
97"codemirror_mode": {
98"name": "ipython",
99"version": 3
100},
101"file_extension": ".py",
102"mimetype": "text/x-python",
103"name": "python",
104"nbconvert_exporter": "python",
105"pygments_lexer": "ipython3",
106"version": "3.6.9"
107}
108},
109"nbformat": 4,
110"nbformat_minor": 2
111}
112