oreilly-gpt-hands-on-nlg
{
"cells": [
{
"cell_type": "markdown",
"id": "demographic-kenya",
"metadata": {},
"source": [
"## GPT-2 trained on a dialogue corpus\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/microsoft/DialoGPT-large)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "contained-convenience",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"import torch\n",
"from transformers import pipeline\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')\n",
"model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "realistic-leone",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
">> User:Hey what are you up to?\n",
"DialoGPT: Nothing much. You?\n",
">> User:Nothing\n",
"DialoGPT: That's good.\n"
]
}
],
"source": [
"# Let's chat for 2 turns\n",
"for step in range(2):\n",
"    # encode the new user input, add the eos_token, and return a tensor in PyTorch\n",
"    new_user_input_ids = tokenizer.encode(input(\">> User:\") + tokenizer.eos_token, return_tensors='pt')\n",
"\n",
"    # append the new user input tokens to the chat history\n",
"    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids\n",
"\n",
"    # generate a response, capping the total chat history at 1000 tokens\n",
"    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)\n",
"\n",
"    # pretty-print the last output tokens from the bot\n",
"    print(\"DialoGPT: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "experienced-creator",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Hey what are you up to?<|endoftext|>Nothing much. You?<|endoftext|>Nothing<|endoftext|>'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.decode(bot_input_ids[0]) # note the <|endoftext|> tokens between turns"
]
},
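{
"cell_type": "markdown",
"id": "added-dialogpt-sampling-note",
"metadata": {},
"source": [
"The chat loop above uses `generate`'s default greedy decoding. As a minimal sketch (not part of the original run), the same chat history can be re-generated with sampling to get less repetitive replies; the `top_k`, `top_p`, and `temperature` values below are illustrative, not tuned."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-dialogpt-sampling-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: re-generate a reply for the existing chat history with sampling.\n",
"# Assumes bot_input_ids from the loop above; parameter values are illustrative.\n",
"sampled_ids = model.generate(\n",
"    bot_input_ids,\n",
"    max_length=1000,\n",
"    pad_token_id=tokenizer.eos_token_id,\n",
"    do_sample=True,   # sample from the distribution instead of greedy argmax\n",
"    top_k=50,         # keep only the 50 most likely next tokens\n",
"    top_p=0.95,       # nucleus sampling\n",
"    temperature=0.8\n",
")\n",
"print(\"DialoGPT (sampled):\", tokenizer.decode(sampled_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))"
]
},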
{
"cell_type": "markdown",
"id": "auburn-minutes",
"metadata": {},
"source": [
"## Turkish GPT-2\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/redrussianarmy/gpt2-turkish-cased)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "wanted-consortium",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merhaba. Ben Bilmem\n"
]
}
],
"source": [
"turkish_tokenizer = AutoTokenizer.from_pretrained(\"redrussianarmy/gpt2-turkish-cased\")\n",
"\n",
"turkish_model = AutoModelForCausalLM.from_pretrained(\"redrussianarmy/gpt2-turkish-cased\")\n",
"\n",
"turkish_generator = pipeline(\n",
"    'text-generation', model=turkish_model, tokenizer=turkish_tokenizer\n",
")\n",
"\n",
"print(turkish_generator('Merhaba. Ben', max_length=5)[0]['generated_text']) # Hi. I wouldn't know"
]
},
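{
"cell_type": "markdown",
"id": "added-turkish-sampling-note",
"metadata": {},
"source": [
"`max_length=5` counts tokens including the prompt, so the call above leaves room for only a couple of generated tokens. A minimal sketch (not in the original run): sample a few longer Turkish continuations; the `max_length` and sampling settings below are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-turkish-sampling-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a few longer, sampled continuations from the Turkish model.\n",
"# max_length includes the prompt tokens; values here are illustrative.\n",
"for output in turkish_generator('Merhaba. Ben', max_length=20, do_sample=True, num_return_sequences=3):\n",
"    print(output['generated_text'])"
]
},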
{
"cell_type": "code",
"execution_count": null,
"id": "compound-painting",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "streaming-partition",
"metadata": {},
"source": [
"## Python code completion\n",
"\n",
"[Hugging Face repo here](https://huggingface.co/Sentdex/GPyT)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "italian-delhi",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(\"Sentdex/GPyT\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"Sentdex/GPyT\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "authentic-prophet",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd.read_csv('data/data/data\n"
]
}
],
"source": [
"input_code = \"\"\"import pandas as pd\n",
"import numpy as np\n",
"\n",
"df = pd\"\"\" # I'd expect a read_csv here\n",
"\n",
"converted = input_code.replace(\"\\n\", \"<N>\")  # GPyT represents newlines as <N>\n",
"tokenized = tokenizer.encode(converted, return_tensors='pt')\n",
"resp = model.generate(tokenized, num_beams=3, max_length=tokenized.shape[1] + 10)\n",
"\n",
"decoded = tokenizer.decode(resp[0])\n",
"reformatted = decoded.replace(\"<N>\", \"\\n\")\n",
"\n",
"print(reformatted)"
]
},
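{
"cell_type": "markdown",
"id": "added-gpyt-helper-note",
"metadata": {},
"source": [
"The cell above works, but the `<N>` round-trip is easy to forget. A small sketch (not part of the original notebook): `complete_code` is a hypothetical helper that bundles the newline encoding, beam-search generation, and decoding into one call."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-gpyt-helper-code",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helper (illustrative, not from the original notebook).\n",
"def complete_code(source_code, new_tokens=10, num_beams=3):\n",
"    flattened = source_code.replace(\"\\n\", \"<N>\")  # GPyT represents newlines as <N>\n",
"    input_ids = tokenizer.encode(flattened, return_tensors='pt')\n",
"    output_ids = model.generate(\n",
"        input_ids,\n",
"        num_beams=num_beams,\n",
"        max_length=input_ids.shape[1] + new_tokens\n",
"    )\n",
"    return tokenizer.decode(output_ids[0]).replace(\"<N>\", \"\\n\")\n",
"\n",
"print(complete_code(\"import numpy as np\\n\\narr = np.\"))"
]
},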
{
"cell_type": "code",
"execution_count": null,
"id": "e613ac37",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2e7b4e10",
"metadata": {},
"outputs": [],
"source": [
"# Examples inspired by https://nlpcloud.io/effectively-using-gpt-j-gpt-neo-gpt-3-alternatives-few-shot-learning.html"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a870d1f3",
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"https://huggingface.co/EleutherAI/gpt-neo-1.3B\n",
"\n",
"GPT-Neo 1.3B is a transformer model designed using EleutherAI's replication of the GPT-3 architecture. \n",
"GPT-Neo refers to the class of models, while 1.3B represents the number of parameters of this particular \n",
"pre-trained model.\n",
"\n",
"GPT-Neo 1.3B was trained on the Pile, a large scale curated dataset created by EleutherAI \n",
"for the purpose of training this model. https://pile.eleuther.ai\n",
"'''\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"EleutherAI/gpt-neo-1.3B\")\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\"EleutherAI/gpt-neo-1.3B\")\n",
"\n",
"gpt_neo = pipeline(\n",
"    'text-generation', model=model, tokenizer=tokenizer\n",
")"
]
},
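{
"cell_type": "markdown",
"id": "added-gpt-neo-fp16-note",
"metadata": {},
"source": [
"GPT-Neo 1.3B is a multi-gigabyte download and is slow on CPU. An optional, hedged sketch (not part of the original run): the same checkpoint can be loaded in half precision and placed on a GPU via the pipeline's `device` argument when one is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-gpt-neo-fp16-code",
"metadata": {},
"outputs": [],
"source": [
"# Optional (illustrative): load GPT-Neo in float16 and run it on a GPU if one is available.\n",
"if torch.cuda.is_available():\n",
"    fp16_model = AutoModelForCausalLM.from_pretrained(\n",
"        \"EleutherAI/gpt-neo-1.3B\", torch_dtype=torch.float16\n",
"    )\n",
"    gpt_neo = pipeline(\n",
"        'text-generation', model=fp16_model, tokenizer=tokenizer, device=0\n",
"    )"
]
},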
{
"cell_type": "code",
"execution_count": 8,
"id": "e3093afd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction: I do not want to go.\n",
"###\n"
]
}
],
"source": [
"# few-shot spelling and grammar correction\n",
"for result in gpt_neo(\"\"\"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction:\"\"\",\n",
"    max_length=75, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
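{
"cell_type": "markdown",
"id": "added-few-shot-postprocess-note",
"metadata": {},
"source": [
"The output above echoes the whole few-shot prompt, and generation can run past the next `###` separator. A minimal sketch (not in the original notebook; `few_shot_prompt` and `extract_completion` are illustrative names): keep the prompt in a variable so the new completion can be sliced off and cut at the first separator."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-few-shot-postprocess-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: post-process a few-shot generation down to just the model's new completion.\n",
"few_shot_prompt = \"\"\"I love goin to the beach.\n",
"Correction: I love going to the beach.\n",
"###\n",
"Let me hav it!\n",
"Correction: Let me have it!\n",
"###\n",
"It have too many drawbacks.\n",
"Correction: It has too many drawbacks.\n",
"###\n",
"I do not wan to go\n",
"Correction:\"\"\"\n",
"\n",
"def extract_completion(prompt, generated_text, stop=\"###\"):\n",
"    # drop the echoed prompt, then keep only the text before the next separator\n",
"    return generated_text[len(prompt):].split(stop)[0].strip()\n",
"\n",
"full_text = gpt_neo(few_shot_prompt, max_length=75, early_stopping=True)[0]['generated_text']\n",
"print(extract_completion(few_shot_prompt, full_text))"
]
},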
{
"cell_type": "code",
"execution_count": 11,
"id": "239edaad",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"I want to start coding tomorrow because it seems to be so fun!\n",
"Intent: start coding\n",
"###\n",
"Show me the last pictures you have please.\n",
"Intent: show pictures\n",
"###\n",
"Search all these files as fast as possible.\n",
"Intent: search files\n",
"###\n",
"Can you please teach me Chinese next week?\n",
"Intent: learn Chinese next week\n",
"###\n",
"Please show me some of your games.\n",
"Intent\n"
]
}
],
"source": [
"# intent detection\n",
"for result in gpt_neo(\"\"\"I want to start coding tomorrow because it seems to be so fun!\n",
"Intent: start coding\n",
"###\n",
"Show me the last pictures you have please.\n",
"Intent: show pictures\n",
"###\n",
"Search all these files as fast as possible.\n",
"Intent: search files\n",
"###\n",
"Can you please teach me Chinese next week?\n",
"Intent:\"\"\",\n",
"    max_length=90, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "41d67312",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code: <span style=font-size: large;><h1><b>AI</b></h1>\n",
"###\n",
"description: a Headline saying Welcome\n"
]
}
],
"source": [
"# few-shot HTML generation from a description\n",
"for result in gpt_neo(\"\"\"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\",\n",
"    max_length=150, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "198d9d23",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code: <h2>Welcome to AI</h2>\n",
"###\n",
"description: a circle with text saying something else\n",
"code: <circle id='\n"
]
}
],
"source": [
"# I will tweak their example a little bit to add a task prompt ('HTML code'). Some Sinan wisdom. The headline is much simpler code now :)\n",
"for result in gpt_neo(\"\"\"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a blue box that contains yellow circles with red borders\n",
"code: <div style=background-color: blue; padding: 20px;><div style=background-color: yellow; border: 5px solid red; border-radius: 50%; padding: 20px; width: 100px; height: 100px;>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\",\n",
"    max_length=150, early_stopping=True):\n",
"    print(result['generated_text'])"
]
},
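{
"cell_type": "markdown",
"id": "added-html-postprocess-note",
"metadata": {},
"source": [
"The same trimming idea applies here. A rough sketch (not part of the original run) that reuses the illustrative `extract_completion` helper from the spelling-correction section on a shortened version of the HTML prompt above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "added-html-postprocess-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: keep only the generated HTML for the last description.\n",
"# Uses the illustrative extract_completion helper defined earlier; the prompt is a\n",
"# shortened version of the one above.\n",
"html_prompt = \"\"\"HTML code\n",
"description: a red button that says stop\n",
"code: <button style=color:white; background-color:red;>Stop</button>\n",
"###\n",
"description: a Headline saying Welcome to AI\n",
"code:\"\"\"\n",
"\n",
"full_text = gpt_neo(html_prompt, max_length=100, early_stopping=True)[0]['generated_text']\n",
"print(extract_completion(html_prompt, full_text))"
]
},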
{
"cell_type": "code",
"execution_count": null,
"id": "c9921ae8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}