llm-finetuning
/
16.Neo4j_and_LangChain_for_Enhanced_Question_Answering.ipynb
479 строк · 13.6 Кб
1{
2"cells": [
3{
4"cell_type": "markdown",
5"metadata": {
6"id": "LXzvg_0MUzhA"
7},
8"source": [
9"## Integrating Unstructured and Graph Knowledge with Neo4j and LangChain for Enhanced Question Answering"
10]
11},
12{
13"cell_type": "markdown",
14"metadata": {
15"id": "b0AVqZ9XVC9Z"
16},
17"source": [
18"\n",
19"\n",
20"#### Installing Dependencies"
21]
22},
23{
24"cell_type": "code",
25"execution_count": null,
26"metadata": {
27"colab": {
28"base_uri": "https://localhost:8080/"
29},
30"id": "F52G_upjVIGt",
31"outputId": "9e26d2c5-6294-4c3f-de4f-5a51e8961003"
32},
33"outputs": [],
34"source": [
35"# !pip install -qU \\\n",
36"# transformers \\\n",
37"# datasets \\\n",
38"# langchain \\\n",
39"# openai \\\n",
40"# wikipedia \\\n",
41"# tiktoken \\\n",
42"# neo4j \\\n",
43"# python-dotenv"
44]
45},
46{
47"cell_type": "markdown",
48"metadata": {
49"id": "B_5pjB2WX_DZ"
50},
51"source": [
52"#### Importing Packages"
53]
54},
55{
56"cell_type": "code",
57"execution_count": 2,
58"metadata": {
59"id": "R-h-iIDmYFGh"
60},
61"outputs": [
62{
63"name": "stderr",
64"output_type": "stream",
65"text": [
66"c:\\Users\\ibm26\\anaconda3\\envs\\pharmagpt\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
67" from .autonotebook import tqdm as notebook_tqdm\n",
68"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
69]
70}
71],
72"source": [
73"import os\n",
74"import re\n",
75"from langchain.vectorstores.neo4j_vector import Neo4jVector\n",
76"from langchain.document_loaders import WikipediaLoader\n",
77"from langchain.embeddings.openai import OpenAIEmbeddings\n",
78"from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
79"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
80"from dotenv import load_dotenv"
81]
82},
83{
84"cell_type": "markdown",
85"metadata": {
86"id": "_JY_gy3BqptG"
87},
88"source": [
89"#### Setting API Keys in Environment Variables"
90]
91},
92{
93"cell_type": "code",
94"execution_count": 3,
95"metadata": {
96"id": "BIHTHxNtYGRN"
97},
98"outputs": [],
99"source": [
100"load_dotenv()\n",
101"# os.environ[\"OPENAI_API_KEY\"] = ''\n",
102"os.environ[\"NEO4J_URI\"] = 'bolt://localhost:7687'\n",
103"os.environ[\"NEO4J_USERNAME\"] = 'neo4j'\n",
104"os.environ[\"NEO4J_PASSWORD\"] = 'docdb@123'  # FIXME: hardcoded credential — move to .env and rely on load_dotenv() above"
105]
106},
107{
108"cell_type": "code",
109"execution_count": 4,
110"metadata": {
111"colab": {
112"base_uri": "https://localhost:8080/"
113},
114"id": "lhhy1qO5orHa",
115"outputId": "a99bdcfc-56e3-43c1-8926-562af2f19e44"
116},
117"outputs": [
118{
119"name": "stdout",
120"output_type": "stream",
121"text": [
122"bolt://localhost:7687\n",
123"neo4j\n",
124"docdb@123\n"
125]
126}
127],
128"source": [
129"# print(os.getenv('OPENAI_API_KEY'))\n",
130"print(os.getenv(\"NEO4J_URI\"))\n",
131"print(os.getenv(\"NEO4J_USERNAME\"))\n",
132"print(os.getenv('NEO4J_PASSWORD'))"
133]
134},
135{
136"cell_type": "markdown",
137"metadata": {
138"id": "QiTvOmA8rgeZ"
139},
140"source": [
141"#### Data Preprocessing"
142]
143},
144{
145"cell_type": "code",
146"execution_count": 4,
147"metadata": {
148"colab": {
149"base_uri": "https://localhost:8080/"
150},
151"id": "lUF9fXmvYO1h",
152"outputId": "a5b6c25c-5672-455d-e785-4c20148aa013"
153},
154"outputs": [
155{
156"name": "stdout",
157"output_type": "stream",
158"text": [
159"Number of tokens: 11\n"
160]
161}
162],
163"source": [
164"from transformers import AutoTokenizer\n",
165"\n",
166"# Define the tokenizer using \"bert-base-uncased\"\n",
167"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
168"\n",
169"# Function to calculate the number of tokens in a text\n",
170"def bert_len(text):\n",
171" tokens = tokenizer.encode(text)\n",
172" return len(tokens)\n",
173"\n",
174"# Example usage\n",
175"input_text = \"This is a sample sentence for tokenization.\"\n",
176"num_tokens = bert_len(input_text)\n",
177"print(f\"Number of tokens: {num_tokens}\")"
178]
179},
180{
181"cell_type": "code",
182"execution_count": 5,
183"metadata": {},
184"outputs": [],
185"source": [
186"from langchain.document_loaders import PyPDFLoader\n",
187"\n",
188"loader = PyPDFLoader(\"./docs/YouCanHaveAnAmazingMemoryLearn.pdf\")\n",
189"pages = loader.load_and_split()"
190]
191},
192{
193"cell_type": "code",
194"execution_count": null,
195"metadata": {
196"colab": {
197"base_uri": "https://localhost:8080/"
198},
199"id": "YQqG0o7KbmqZ",
200"outputId": "7b8e48d5-df80-4d1a-d192-11ce439fc62a"
201},
202"outputs": [],
203"source": [
204"# # Load Wikipedia articles related to \"Sachin Tendulkar\"\n",
205"# raw_documents = WikipediaLoader(query=\"Sachin Tendulkar\").load()\n",
206"\n",
207"# # Define a text splitter with specific parameters\n",
208"# text_splitter = RecursiveCharacterTextSplitter(\n",
209"# chunk_size=200, chunk_overlap=20, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
210"# )\n",
211"\n",
212"# # Split the content of the first Wikipedia article into smaller documents\n",
213"# documents = text_splitter.create_documents([raw_documents[0].page_content])"
214]
215},
216{
217"cell_type": "code",
218"execution_count": null,
219"metadata": {
220"colab": {
221"base_uri": "https://localhost:8080/"
222},
223"id": "c68Duv2Nbqqk",
224"outputId": "4510be6f-af0c-4c08-e814-da06bf75f7f2"
225},
226"outputs": [],
227"source": [
228"print(len(documents))"
229]
230},
231{
232"cell_type": "code",
233"execution_count": 6,
234"metadata": {},
235"outputs": [],
236"source": [
237"# Define a text splitter with specific parameters\n",
238"text_splitter = RecursiveCharacterTextSplitter(\n",
239" chunk_size=1000, chunk_overlap=200, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
240")\n",
241"\n",
242"# Split the content of PDF page index 4 into smaller documents\n",
243"documents = text_splitter.create_documents([pages[4].page_content])"
244]
245},
246{
247"cell_type": "markdown",
248"metadata": {
249"id": "X043ugczr0X5"
250},
251"source": [
252"#### Initializing the Neo4j Graph Database"
253]
254},
255{
256"cell_type": "code",
257"execution_count": 7,
258"metadata": {
259"id": "RSHWwlbJcCi2"
260},
261"outputs": [],
262"source": [
263"# Instantiate Neo4j vector from documents\n",
264"neo4j_vector = Neo4jVector.from_documents(\n",
265" documents,\n",
266" OpenAIEmbeddings(),\n",
267" url=os.environ[\"NEO4J_URI\"],\n",
268" username=os.environ[\"NEO4J_USERNAME\"],\n",
269" password=os.environ[\"NEO4J_PASSWORD\"]\n",
270")"
271]
272},
273{
274"cell_type": "markdown",
275"metadata": {
276"id": "34Fm9UgHwWdG"
277},
278"source": [
279"#### Performing Similarity Search on Ingested Documents"
280]
281},
282{
283"cell_type": "code",
284"execution_count": 8,
285"metadata": {
286"colab": {
287"base_uri": "https://localhost:8080/"
288},
289"id": "qp8bl2hjruzg",
290"outputId": "b5922d9e-7f16-4250-f917-9af03d361fa4"
291},
292"outputs": [
293{
294"name": "stdout",
295"output_type": "stream",
296"text": [
297"Chapter 26:\n",
298" Using the tools: Study and learning\n",
299"Chapter 27:\n",
300" Using the tools: Everyday ways to train your\n",
301"memory\n",
302"Chapter 28:\n",
303" Using the tools: Just for fun\n",
304"Chapter 29:\n",
305" Age equals experience, not forgetfulness!\n",
306"Chapter 30:\n",
307" I’ve done all that, now what can I expect?\n",
308"Chapter 31:\n",
309" Look at what you can do now!\n",
310" \n",
311"Afterword: The champions of the future\n",
312" \n",
313"Index\n",
314"\n",
315"Chapter 26:\n",
316" Using the tools: Study and learning\n",
317"Chapter 27:\n",
318" Using the tools: Everyday ways to train your\n",
319"memory\n",
320"Chapter 28:\n",
321" Using the tools: Just for fun\n",
322"Chapter 29:\n",
323" Age equals experience, not forgetfulness!\n",
324"Chapter 30:\n",
325" I’ve done all that, now what can I expect?\n",
326"Chapter 31:\n",
327" Look at what you can do now!\n",
328" \n",
329"Afterword: The champions of the future\n",
330" \n",
331"Index\n"
332]
333}
334],
335"source": [
336"# Define the query.\n",
337"query = \"What is the introduction on book?\"\n",
338"\n",
339"# Execute the query, get top 2 results.\n",
340"vector_results = neo4j_vector.similarity_search(query, k=2)\n",
341"\n",
342"# Print search results with separation.\n",
343"for i, res in enumerate(vector_results):\n",
344" print(res.page_content)\n",
345" if i != len(vector_results) - 1:\n",
346" print()\n",
347"\n",
348"# Store the content of the most similar result.\n",
349"vector_result = vector_results[0].page_content"
350]
351},
352{
353"cell_type": "markdown",
354"metadata": {
355"id": "yLCySJqcxV3W"
356},
357"source": [
358"#### Building Knowledge Graph"
359]
360},
361{
362"cell_type": "code",
363"execution_count": null,
364"metadata": {
365"id": "EOHIAZrLxh8N"
366},
367"outputs": [],
368"source": [
369"# Necessary libraries to set up the Neo4j DB question-answering chain\n",
370"from langchain.chat_models import ChatOpenAI\n",
371"from langchain.chains import GraphCypherQAChain\n",
372"from langchain.graphs import Neo4jGraph"
373]
374},
375{
376"cell_type": "code",
377"execution_count": null,
378"metadata": {
379"id": "N_sqke-SzfEQ"
380},
381"outputs": [],
382"source": [
383"# Create a Neo4jGraph object by connecting to a Neo4j database.\n",
384"graph = Neo4jGraph(\n",
385" url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"docdb@123\"  # FIXME: hardcoded credentials — reuse the NEO4J_* environment variables set earlier\n",
386")\n",
387"# from py2neo import Graph\n",
388"# graph = Graph(os.environ[\"NEO4J_URI\"],\n",
389"# auth = (os.environ[\"NEO4J_USERNAME\"], \n",
390"# os.environ[\"NEO4J_PASSWORD\"]))"
391]
392},
393{
394"cell_type": "code",
395"execution_count": null,
396"metadata": {
397"colab": {
398"base_uri": "https://localhost:8080/"
399},
400"id": "oHFGTHCEz_UD",
401"outputId": "eaf7ba8f-7188-4b5a-f4b3-f786f4a457ae"
402},
403"outputs": [],
404"source": [
405"# Print the schema of the Neo4j graph.\n",
406"print(graph.schema)"
407]
408},
409{
410"cell_type": "code",
411"execution_count": null,
412"metadata": {
413"id": "wN_9M9fi0OV8"
414},
415"outputs": [],
416"source": [
417"# Create a question-answering chain using an OpenAI chat model and a Neo4j graph, with verbose mode enabled.\n",
418"chain = GraphCypherQAChain.from_llm(\n",
419" ChatOpenAI(temperature=0.9), graph=graph, verbose=True\n",
420")"
421]
422},
423{
424"cell_type": "code",
425"execution_count": null,
426"metadata": {
427"colab": {
428"base_uri": "https://localhost:8080/"
429},
430"id": "Iw0foDNr0c-v",
431"outputId": "8a776932-7768-4191-d2a4-37c2163b9d2e"
432},
433"outputs": [],
434"source": [
435"# Use the question-answering chain to query the Neo4j graph.\n",
436"graph_result = chain.run(\"What is the book about?\")"
437]
438},
439{
440"cell_type": "code",
441"execution_count": null,
442"metadata": {
443"colab": {
444"base_uri": "https://localhost:8080/",
445"height": 37
446},
447"id": "ljHsyK3z0pAf",
448"outputId": "7d95b141-ce27-4404-9e09-1f12c07f3ab8"
449},
450"outputs": [],
451"source": [
452"graph_result"
453]
454}
455],
456"metadata": {
457"colab": {
458"provenance": []
459},
460"kernelspec": {
461"display_name": "Python 3",
462"name": "python3"
463},
464"language_info": {
465"codemirror_mode": {
466"name": "ipython",
467"version": 3
468},
469"file_extension": ".py",
470"mimetype": "text/x-python",
471"name": "python",
472"nbconvert_exporter": "python",
473"pygments_lexer": "ipython3",
474"version": "3.9.18"
475}
476},
477"nbformat": 4,
478"nbformat_minor": 0
479}
480