simpletransformers
159 строк · 8.8 Кб
1{
2"metadata": {
3"language_info": {
4"codemirror_mode": {
5"name": "ipython",
6"version": 3
7},
8"file_extension": ".py",
9"mimetype": "text/x-python",
10"name": "python",
11"nbconvert_exporter": "python",
12"pygments_lexer": "ipython3",
13"version": "3.7.7-final"
14},
15"orig_nbformat": 2,
16"kernelspec": {
17"name": "Python 3.7.7 64-bit ('st': conda)",
18"display_name": "Python 3.7.7 64-bit ('st': conda)",
19"metadata": {
20"interpreter": {
21"hash": "cd15d060c920a31a2004e0f1a0ebfcce168f4cda13577a5cbc6fc38473eace97"
22}
23}
24}
25},
26"nbformat": 4,
27"nbformat_minor": 2,
28"cells": [
29{
30"cell_type": "code",
31"execution_count": 1,
32"metadata": {},
33"outputs": [],
34"source": [
35"import os\n",
36"import pandas as pd"
37]
38},
39{
40"cell_type": "code",
41"execution_count": 2,
42"metadata": {},
43"outputs": [],
44"source": [
def _read_lines(path):
    """Return the lines of the text file at ``path`` with newlines removed.

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Iterating the handle yields the same lines as ``readlines()``; each
        # line carries at most one trailing "\n", which is all we drop.
        return [line.rstrip("\n") for line in f]


def _build_translation_frame(data_path, split):
    """Build the bidirectional translation frame for one split.

    ``split`` is the file stem ("train" or "test"); ``<split>.trg`` is read as
    the Sinhala side and ``<split>.src`` as the English side.
    """
    sinhala_text = _read_lines(os.path.join(data_path, split + ".trg"))
    english_text = _read_lines(os.path.join(data_path, split + ".src"))

    rows = []
    # Lines are paired by position. ``zip`` stops at the shorter file, so any
    # surplus lines in the longer file are ignored.
    for sinhala, english in zip(sinhala_text, english_text):
        rows.append(["translate sinhala to english", sinhala, english])
        rows.append(["translate english to sinhala", english, sinhala])

    return pd.DataFrame(rows, columns=["prefix", "input_text", "target_text"])


def prepare_translation_datasets(data_path):
    """Build training and evaluation frames for Sinhala <-> English translation.

    Reads ``train.trg``/``train.src`` and ``test.trg``/``test.src`` from
    ``data_path``. The ``.trg`` files are treated as Sinhala text and the
    ``.src`` files as English text, one entry per line, paired by line number.
    Every pair produces two rows, one per translation direction, with the
    Sinhala-to-English row first.

    Args:
        data_path: Directory containing the four corpus files.

    Returns:
        A ``(train_df, eval_df)`` tuple of DataFrames, each with the columns
        ``prefix``, ``input_text`` and ``target_text``.

    Raises:
        OSError: If one of the corpus files cannot be opened.
        UnicodeDecodeError: If a corpus file is not valid UTF-8.
    """
    train_df = _build_translation_frame(data_path, "train")
    eval_df = _build_translation_frame(data_path, "test")

    return train_df, eval_df
77]
78},
79{
80"cell_type": "code",
81"execution_count": 3,
82"metadata": {},
83"outputs": [],
84"source": [
# Load the corpus under data/eng-sin and build the train / eval frames.
train_df, eval_df = prepare_translation_datasets(data_path="data/eng-sin")
86]
87},
88{
89"cell_type": "code",
90"execution_count": 4,
91"metadata": {},
92"outputs": [
93{
94"output_type": "execute_result",
95"data": {
96"text/plain": [
97" prefix \\\n",
98"0 translate sinhala to english \n",
99"1 translate english to sinhala \n",
100"2 translate sinhala to english \n",
101"3 translate english to sinhala \n",
102"4 translate sinhala to english \n",
103"... ... \n",
104"2255049 translate english to sinhala \n",
105"2255050 translate sinhala to english \n",
106"2255051 translate english to sinhala \n",
107"2255052 translate sinhala to english \n",
108"2255053 translate english to sinhala \n",
109"\n",
110" input_text \\\n",
111"0 මෙය සිදු වන්නේ කවදාද? \n",
112"1 When will this happen? \n",
113"2 දුවන්න ! \n",
114"3 Run! \n",
115"4 - අනිද්දට \n",
116"... ... \n",
117"2255049 As those words indicate, some Christians will ... \n",
118"2255050 එතකොට ඔයා මොකද කරන්නේ? \n",
119"2255051 Then what are you to do? \n",
120"2255052 ඔහු ජාන විද්යාඥයෙක්, මානව ජානවිද්යාව ගැන විශ... \n",
121"2255053 Only that he's a genecist, an expert on human ... \n",
122"\n",
123" target_text \n",
124"0 When will this happen? \n",
125"1 මෙය සිදු වන්නේ කවදාද? \n",
126"2 Run! \n",
127"3 දුවන්න ! \n",
128"4 - The day after tomorrow. \n",
129"... ... \n",
130"2255049 ඒ විරුද්ධවාදිකම් ආගමික නැත්නම් දේශපාලන නයකයන්ග... \n",
131"2255050 Then what are you to do? \n",
132"2255051 එතකොට ඔයා මොකද කරන්නේ? \n",
133"2255052 Only that he's a genecist, an expert on human ... \n",
134"2255053 ඔහු ජාන විද්යාඥයෙක්, මානව ජානවිද්යාව ගැන විශ... \n",
135"\n",
136"[2255054 rows x 3 columns]"
137],
138"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>prefix</th>\n <th>input_text</th>\n <th>target_text</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>translate sinhala to english</td>\n <td>මෙය සිදු වන්නේ කවදාද?</td>\n <td>When will this happen?</td>\n </tr>\n <tr>\n <th>1</th>\n <td>translate english to sinhala</td>\n <td>When will this happen?</td>\n <td>මෙය සිදු වන්නේ කවදාද?</td>\n </tr>\n <tr>\n <th>2</th>\n <td>translate sinhala to english</td>\n <td>දුවන්න !</td>\n <td>Run!</td>\n </tr>\n <tr>\n <th>3</th>\n <td>translate english to sinhala</td>\n <td>Run!</td>\n <td>දුවන්න !</td>\n </tr>\n <tr>\n <th>4</th>\n <td>translate sinhala to english</td>\n <td>- අනිද්දට</td>\n <td>- The day after tomorrow.</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>2255049</th>\n <td>translate english to sinhala</td>\n <td>As those words indicate, some Christians will ...</td>\n <td>ඒ විරුද්ධවාදිකම් ආගමික නැත්නම් දේශපාලන නයකයන්ග...</td>\n </tr>\n <tr>\n <th>2255050</th>\n <td>translate sinhala to english</td>\n <td>එතකොට ඔයා මොකද කරන්නේ?</td>\n <td>Then what are you to do?</td>\n </tr>\n <tr>\n <th>2255051</th>\n <td>translate english to sinhala</td>\n <td>Then what are you to do?</td>\n <td>එතකොට ඔයා මොකද කරන්නේ?</td>\n </tr>\n <tr>\n <th>2255052</th>\n <td>translate sinhala to english</td>\n <td>ඔහු ජාන විද්යාඥයෙක්, මානව ජානවිද්යාව ගැන විශ...</td>\n <td>Only that he's a genecist, an expert on human ...</td>\n </tr>\n <tr>\n <th>2255053</th>\n <td>translate english to sinhala</td>\n <td>Only that he's a genecist, an expert on human ...</td>\n <td>ඔහු ජාන විද්යාඥයෙක්, මානව ජානවිද්යාව ගැන විශ...</td>\n </tr>\n 
</tbody>\n</table>\n<p>2255054 rows × 3 columns</p>\n</div>"
139},
140"metadata": {},
141"execution_count": 4
142}
143],
144"source": [
# Show the training frame as the cell result (pandas abbreviates long frames
# to their first and last rows).
train_df
146]
147},
148{
149"cell_type": "code",
150"execution_count": null,
151"metadata": {},
152"outputs": [],
153"source": [
# Write both frames as tab-separated files under data/.
for frame, out_path in ((train_df, "data/train.tsv"), (eval_df, "data/eval.tsv")):
    frame.to_csv(out_path, sep="\t")
156]
157}
158]
159}