oreilly-gpt-hands-on-nlg
{
"cells": [
{
"cell_type": "markdown",
"id": "demographic-kenya",
"metadata": {},
"source": [
"## GPT-2 trained on a dialogue corpus\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/microsoft/DialoGPT-large)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "contained-convenience",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"import torch\n",
"from transformers import pipeline\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')\n",
"model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "realistic-leone",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">> User:Hey what are you up to?\n",
"DialoGPT: Nothing much. You?\n",
">> User:Nothing\n",
"DialoGPT: That's good.\n"
]
}
],
"source": [
"# Let's chat for 2 turns\n",
"for step in range(2):\n",
"    # encode the new user input, add the eos_token, and return a tensor in PyTorch\n",
"    new_user_input_ids = tokenizer.encode(input(\">> User:\") + tokenizer.eos_token, return_tensors='pt')\n",
"\n",
"    # append the new user input tokens to the chat history\n",
"    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids\n",
"\n",
"    # generate a response, capping the total chat history at 1000 tokens\n",
"    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)\n",
"\n",
"    # pretty-print the last output tokens from the bot\n",
"    print(\"DialoGPT: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "experienced-creator",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Hey what are you up to?<|endoftext|>Nothing much. You?<|endoftext|>Nothing<|endoftext|>'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.decode(bot_input_ids[0]) # note the <|endoftext|> tokens between turns"
]
},
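{
"cell_type": "markdown",
"id": "added-dialogpt-sampling-note",
"metadata": {},
"source": [
"The chat loop above uses `generate`'s default greedy decoding. As a minimal sketch (not part of the original run), the same chat history can be re-generated with sampling to get less repetitive replies; the `top_k`, `top_p`, and `temperature` values below are illustrative, not tuned."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-dialogpt-sampling-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: re-generate a reply for the existing chat history with sampling.\n",
"# Assumes bot_input_ids from the loop above; parameter values are illustrative.\n",
"sampled_ids = model.generate(\n",
"    bot_input_ids,\n",
"    max_length=1000,\n",
"    pad_token_id=tokenizer.eos_token_id,\n",
"    do_sample=True,   # sample from the distribution instead of greedy argmax\n",
"    top_k=50,         # keep only the 50 most likely next tokens\n",
"    top_p=0.95,       # nucleus sampling\n",
"    temperature=0.8\n",
")\n",
"print(\"DialoGPT (sampled):\", tokenizer.decode(sampled_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))"
]
},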
{
"cell_type": "markdown",
"id": "auburn-minutes",
"metadata": {},
"source": [
"## Turkish GPT-2\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/redrussianarmy/gpt2-turkish-cased)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "wanted-consortium",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merhaba. Ben Bilmem\n"
]
}
],
"source": [
"turkish_tokenizer = AutoTokenizer.from_pretrained(\"redrussianarmy/gpt2-turkish-cased\")\n",
"\n",
"turkish_model = AutoModelForCausalLM.from_pretrained(\"redrussianarmy/gpt2-turkish-cased\")\n",
"\n",
"turkish_generator = pipeline(\n",
"    'text-generation', model=turkish_model, tokenizer=turkish_tokenizer\n",
")\n",
"\n",
"print(turkish_generator('Merhaba. Ben', max_length=5)[0]['generated_text']) # Hi. I wouldn't know"
]
},
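{
"cell_type": "markdown",
"id": "added-turkish-sampling-note",
"metadata": {},
"source": [
"`max_length=5` counts tokens including the prompt, so the call above leaves room for only a couple of generated tokens. A minimal sketch (not in the original run): sample a few longer Turkish continuations; the `max_length` and sampling settings below are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-turkish-sampling-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a few longer, sampled continuations from the Turkish model.\n",
"# max_length includes the prompt tokens; values here are illustrative.\n",
"for output in turkish_generator('Merhaba. Ben', max_length=20, do_sample=True, num_return_sequences=3):\n",
"    print(output['generated_text'])"
]
},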
{
"cell_type": "code",
"execution_count": null,
"id": "compound-painting",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "streaming-partition",
"metadata": {},
"source": [
"## Python code completion\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/Sentdex/GPyT)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "italian-delhi",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"Sentdex/GPyT\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"Sentdex/GPyT\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "authentic-prophet",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd.read_csv('data/data/data\n"
]
}
],
"source": [
"input_code = \"\"\"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd\"\"\" # I'd expect a read_csv here\n",
"\n",
"converted = input_code.replace(\"\\n\", \"<N>\")  # GPyT represents newlines as <N>\n",
"tokenized = tokenizer.encode(converted, return_tensors='pt')\n",
"resp = model.generate(tokenized, num_beams=3, max_length=tokenized.shape[1] + 10)\n",
"\n",
"decoded = tokenizer.decode(resp[0])\n",
"reformatted = decoded.replace(\"<N>\", \"\\n\")\n",
"\n",
"print(reformatted)"
]
},
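{
"cell_type": "markdown",
"id": "added-gpyt-helper-note",
"metadata": {},
"source": [
"The cell above works, but the `<N>` round-trip is easy to forget. A small sketch (not part of the original notebook): `complete_code` is a hypothetical helper that bundles the newline encoding, beam-search generation, and decoding into one call."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-gpyt-helper-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helper (illustrative, not from the original notebook).\n",
"def complete_code(source_code, new_tokens=10, num_beams=3):\n",
"    flattened = source_code.replace(\"\\n\", \"<N>\")  # GPyT represents newlines as <N>\n",
"    input_ids = tokenizer.encode(flattened, return_tensors='pt')\n",
"    output_ids = model.generate(\n",
"        input_ids,\n",
"        num_beams=num_beams,\n",
"        max_length=input_ids.shape[1] + new_tokens\n",
"    )\n",
"    return tokenizer.decode(output_ids[0]).replace(\"<N>\", \"\\n\")\n",
"\n",
"print(complete_code(\"import numpy as np\\n\\narr = np.\"))"
]
},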
{
"cell_type": "code",
"execution_count": null,
"id": "e613ac37",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2e7b4e10",
"metadata": {},
"outputs": [],
"source": [
"# Examples inspired by https://nlpcloud.io/effectively-using-gpt-j-gpt-neo-gpt-3-alternatives-few-shot-learning.html"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a870d1f3",
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"https://huggingface.co/EleutherAI/gpt-neo-1.3B\n",
"\n",
"GPT-Neo 1.3B is a transformer model designed using EleutherAI's replication of the GPT-3 architecture. \n",
"GPT-Neo refers to the class of models, while 1.3B represents the number of parameters of this particular \n",
"pre-trained model.\n",
"\n",
"GPT-Neo 1.3B was trained on the Pile, a large scale curated dataset created by EleutherAI \n",
"for the purpose of training this model. https://pile.eleuther.ai\n",
"'''\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-1.3B\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-1.3B\")\n",
"\n",
"gpt_neo = pipeline(\n",
"    'text-generation', model=model, tokenizer=tokenizer\n",
")"
]
},
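{
"cell_type": "markdown",
"id": "added-gpt-neo-fp16-note",
"metadata": {},
"source": [
"GPT-Neo 1.3B is a multi-gigabyte download and is slow on CPU. An optional, hedged sketch (not part of the original run): the same checkpoint can be loaded in half precision and placed on a GPU via the pipeline's `device` argument when one is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-gpt-neo-fp16-code",
"metadata": {},
"outputs": [],
"source": [
"# Optional (illustrative): load GPT-Neo in float16 and run it on a GPU if one is available.\n",
"if torch.cuda.is_available():\n",
"    fp16_model = AutoModelForCausalLM.from_pretrained(\n",
"        \"EleutherAI/gpt-neo-1.3B\", torch_dtype=torch.float16\n",
"    )\n",
"    gpt_neo = pipeline(\n",
"        'text-generation', model=fp16_model, tokenizer=tokenizer, device=0\n",
"    )"
]
},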
{
"cell_type": "code",
"execution_count": 8,
"id": "e3093afd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction: I do not want to go.\n",
"###\n"
]
}
],
"source": [
"# few-shot spelling and grammar correction\n",
"for result in gpt_neo(\"\"\"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction:\"\"\",\n",
"    max_length=75, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
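{
"cell_type": "markdown",
"id": "added-few-shot-postprocess-note",
"metadata": {},
"source": [
"The output above echoes the whole few-shot prompt, and generation can run past the next `###` separator. A minimal sketch (not in the original notebook; `few_shot_prompt` and `extract_completion` are illustrative names): keep the prompt in a variable so the new completion can be sliced off and cut at the first separator."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-few-shot-postprocess-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: post-process a few-shot generation down to just the model's new completion.\n",
"few_shot_prompt = \"\"\"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction:\"\"\"\n",
"\n",
"def extract_completion(prompt, generated_text, stop=\"###\"):\n",
"    # drop the echoed prompt, then keep only the text before the next separator\n",
"    return generated_text[len(prompt):].split(stop)[0].strip()\n",
"\n",
"full_text = gpt_neo(few_shot_prompt, max_length=75, early_stopping=True)[0]['generated_text']\n",
"print(extract_completion(few_shot_prompt, full_text))"
]
},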
{
"cell_type": "code",
"execution_count": 11,
"id": "239edaad",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I want to start coding tomorrow because it seems to be so fun!\n",
"Intent: start coding\n",
"###\n",
"Show me the last pictures you have please.\n",
"Intent: show pictures\n",
"###\n",
"Search all these files as fast as possible.\n",
"Intent: search files\n",
"###\n",
"Can you please teach me Chinese next week?\n",
"Intent: learn Chinese next week\n",
"###\n",
"Please show me some of your games.\n",
"Intent\n"
]
}
],
"source": [
"# intent detection\n",
"for result in gpt_neo(\"\"\"I want to start coding tomorrow because it seems to be so fun!\n",
"Intent: start coding\n",
"###\n",
"Show me the last pictures you have please.\n",
"Intent: show pictures\n",
"###\n",
"Search all these files as fast as possible.\n",
"Intent: search files\n",
"###\n",
"Can you please teach me Chinese next week?\n",
"Intent:\"\"\",\n",
"    max_length=90, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "41d67312",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code: <span style=font-size: large;><h1><b>AI</b></h1>\n",
"###\n",
"description: a Headline saying Welcome\n"
]
}
],
"source": [
"# few-shot HTML generation from a description\n",
"for result in gpt_neo(\"\"\"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\",\n",
"    max_length=150, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "198d9d23",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code: <h2>Welcome to AI</h2>\n",
"###\n",
"description: a circle with text saying something else\n",
"code: <circle id='\n"
]
}
],
"source": [
"# I will tweak their example a little bit to add a task prompt ('HTML code'). Some Sinan wisdom. The headline is much simpler code now :)\n",
"for result in gpt_neo(\"\"\"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\",\n",
"    max_length=150, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
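{
"cell_type": "markdown",
"id": "added-html-postprocess-note",
"metadata": {},
"source": [
"The same trimming idea applies here. A rough sketch (not part of the original run) that reuses the illustrative `extract_completion` helper from the spelling-correction section on a shortened version of the HTML prompt above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-html-postprocess-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: keep only the generated HTML for the last description.\n",
"# Uses the illustrative extract_completion helper defined earlier; the prompt is a\n",
"# shortened version of the one above.\n",
"html_prompt = \"\"\"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\"\n",
"\n",
"full_text = gpt_neo(html_prompt, max_length=100, early_stopping=True)[0]['generated_text']\n",
"print(extract_completion(html_prompt, full_text))"
]
},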
{
"cell_type": "code",
"execution_count": null,
"id": "c9921ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}