llm-finetuning
/
16.Neo4j_and_LangChain_for_Enhanced_Question_Answering.ipynb
479 строк · 13.6 Кб
1{
2"cells": [
3{
4"cell_type": "markdown",
5"metadata": {
6"id": "LXzvg_0MUzhA"
7},
8"source": [
9"## Integrating Unstructured and Graph Knowledge with Neo4j and LangChain for Enhanced Question Answering"
10]
11},
12{
13"cell_type": "markdown",
14"metadata": {
15"id": "b0AVqZ9XVC9Z"
16},
17"source": [
18"\n",
19"\n",
20"#### Installing Dependencies"
21]
22},
23{
24"cell_type": "code",
25"execution_count": null,
26"metadata": {
27"colab": {
28"base_uri": "https://localhost:8080/"
29},
30"id": "F52G_upjVIGt",
31"outputId": "9e26d2c5-6294-4c3f-de4f-5a51e8961003"
32},
33"outputs": [],
34"source": [
35"# !pip install -qU \\\n",
36"# transformers \\\n",
37"# datasets \\\n",
38"# langchain \\\n",
39"# openai \\\n",
40"# wikipedia \\\n",
41"# tiktoken \\\n",
42"# neo4j \\\n",
43"# python-dotenv"
44]
45},
46{
47"cell_type": "markdown",
48"metadata": {
49"id": "B_5pjB2WX_DZ"
50},
51"source": [
52"#### Importing Packages"
53]
54},
55{
56"cell_type": "code",
57"execution_count": 2,
58"metadata": {
59"id": "R-h-iIDmYFGh"
60},
61"outputs": [
62{
63"name": "stderr",
64"output_type": "stream",
65"text": [
66"c:\\Users\\ibm26\\anaconda3\\envs\\pharmagpt\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
67" from .autonotebook import tqdm as notebook_tqdm\n",
68"None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
69]
70}
71],
72"source": [
73"import os\n",
74"import re\n",
75"from langchain.vectorstores.neo4j_vector import Neo4jVector\n",
76"from langchain.document_loaders import WikipediaLoader\n",
77"from langchain.embeddings.openai import OpenAIEmbeddings\n",
78"from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
79"from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n",
80"from dotenv import load_dotenv"
81]
82},
83{
84"cell_type": "markdown",
85"metadata": {
86"id": "_JY_gy3BqptG"
87},
88"source": [
89"#### Setting API Keys in Environment Variables"
90]
91},
92{
93"cell_type": "code",
94"execution_count": 3,
95"metadata": {
96"id": "BIHTHxNtYGRN"
97},
98"outputs": [],
99"source": [
100"load_dotenv()\n",
101"# os.environ[\"OPENAI_API_KEY\"] = ''\n",
102"os.environ[\"NEO4J_URI\"] = 'bolt://localhost:7687'\n",
103"os.environ[\"NEO4J_USERNAME\"] = 'neo4j'\n",
104"os.environ[\"NEO4J_PASSWORD\"] = 'docdb@123'  # FIXME: hardcoded credential — move to .env and rely on load_dotenv() above"
105]
106},
107{
108"cell_type": "code",
109"execution_count": 4,
110"metadata": {
111"colab": {
112"base_uri": "https://localhost:8080/"
113},
114"id": "lhhy1qO5orHa",
115"outputId": "a99bdcfc-56e3-43c1-8926-562af2f19e44"
116},
117"outputs": [
118{
119"name": "stdout",
120"output_type": "stream",
121"text": [
122"bolt://localhost:7687\n",
123"neo4j\n",
124"docdb@123\n"
125]
126}
127],
128"source": [
129"# print(os.getenv('OPENAI_API_KEY'))\n",
130"print(os.getenv(\"NEO4J_URI\"))\n",
131"print(os.getenv(\"NEO4J_USERNAME\"))\n",
132"print(os.getenv('NEO4J_PASSWORD'))"
133]
134},
135{
136"cell_type": "markdown",
137"metadata": {
138"id": "QiTvOmA8rgeZ"
139},
140"source": [
141"#### Data Preprocessing"
142]
143},
144{
145"cell_type": "code",
146"execution_count": 4,
147"metadata": {
148"colab": {
149"base_uri": "https://localhost:8080/"
150},
151"id": "lUF9fXmvYO1h",
152"outputId": "a5b6c25c-5672-455d-e785-4c20148aa013"
153},
154"outputs": [
155{
156"name": "stdout",
157"output_type": "stream",
158"text": [
159"Number of tokens: 11\n"
160]
161}
162],
163"source": [
164"from transformers import AutoTokenizer\n",
165"\n",
166"# Define the tokenizer using \"bert-base-uncased\"\n",
167"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
168"\n",
169"# Function to calculate the number of tokens in a text\n",
170"def bert_len(text):\n",
171" tokens = tokenizer.encode(text)\n",
172" return len(tokens)\n",
173"\n",
174"# Example usage\n",
175"input_text = \"This is a sample sentence for tokenization.\"\n",
176"num_tokens = bert_len(input_text)\n",
177"print(f\"Number of tokens: {num_tokens}\")"
178]
179},
180{
181"cell_type": "code",
182"execution_count": 5,
183"metadata": {},
184"outputs": [],
185"source": [
186"from langchain.document_loaders import PyPDFLoader\n",
187"\n",
188"loader = PyPDFLoader(\"./docs/YouCanHaveAnAmazingMemoryLearn.pdf\")\n",
189"pages = loader.load_and_split()"
190]
191},
192{
193"cell_type": "code",
194"execution_count": null,
195"metadata": {
196"colab": {
197"base_uri": "https://localhost:8080/"
198},
199"id": "YQqG0o7KbmqZ",
200"outputId": "7b8e48d5-df80-4d1a-d192-11ce439fc62a"
201},
202"outputs": [],
203"source": [
204"# # Load Wikipedia articles related to \"Sachin Tendulkar\"\n",
205"# raw_documents = WikipediaLoader(query=\"Sachin Tendulkar\").load()\n",
206"\n",
207"# # Define a text splitter with specific parameters\n",
208"# text_splitter = RecursiveCharacterTextSplitter(\n",
209"# chunk_size=200, chunk_overlap=20, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
210"# )\n",
211"\n",
212"# # Split the content of the first Wikipedia article into smaller documents\n",
213"# documents = text_splitter.create_documents([raw_documents[0].page_content])"
214]
215},
216{
217"cell_type": "code",
218"execution_count": null,
219"metadata": {
220"colab": {
221"base_uri": "https://localhost:8080/"
222},
223"id": "c68Duv2Nbqqk",
224"outputId": "4510be6f-af0c-4c08-e814-da06bf75f7f2"
225},
226"outputs": [],
227"source": [
228"print(len(documents))"
229]
230},
231{
232"cell_type": "code",
233"execution_count": 6,
234"metadata": {},
235"outputs": [],
236"source": [
237"# Define a text splitter with specific parameters\n",
238"text_splitter = RecursiveCharacterTextSplitter(\n",
239" chunk_size=1000, chunk_overlap=200, length_function=bert_len, separators=['\\n\\n', '\\n', ' ', '']\n",
240")\n",
241"\n",
242"# Split the content of PDF page index 4 into smaller documents\n",
243"documents = text_splitter.create_documents([pages[4].page_content])"
244]
245},
246{
247"cell_type": "markdown",
248"metadata": {
249"id": "X043ugczr0X5"
250},
251"source": [
252"#### Initializing the Neo4j Graph Database"
253]
254},
255{
256"cell_type": "code",
257"execution_count": 7,
258"metadata": {
259"id": "RSHWwlbJcCi2"
260},
261"outputs": [],
262"source": [
263"# Instantiate Neo4j vector from documents\n",
264"neo4j_vector = Neo4jVector.from_documents(\n",
265" documents,\n",
266" OpenAIEmbeddings(),\n",
267" url=os.environ[\"NEO4J_URI\"],\n",
268" username=os.environ[\"NEO4J_USERNAME\"],\n",
269" password=os.environ[\"NEO4J_PASSWORD\"]\n",
270")"
271]
272},
273{
274"cell_type": "markdown",
275"metadata": {
276"id": "34Fm9UgHwWdG"
277},
278"source": [
279"#### Performing Similarity Search on Ingested Documents"
280]
281},
282{
283"cell_type": "code",
284"execution_count": 8,
285"metadata": {
286"colab": {
287"base_uri": "https://localhost:8080/"
288},
289"id": "qp8bl2hjruzg",
290"outputId": "b5922d9e-7f16-4250-f917-9af03d361fa4"
291},
292"outputs": [
293{
294"name": "stdout",
295"output_type": "stream",
296"text": [
297"Chapter 26:\n",
298" Using the tools: Study and learning\n",
299"Chapter 27:\n",
300" Using the tools: Everyday ways to train your\n",
301"memory\n",
302"Chapter 28:\n",
303" Using the tools: Just for fun\n",
304"Chapter 29:\n",
305" Age equals experience, not forgetfulness!\n",
306"Chapter 30:\n",
307" I’ve done all that, now what can I expect?\n",
308"Chapter 31:\n",
309" Look at what you can do now!\n",
310" \n",
311"Afterword: The champions of the future\n",
312" \n",
313"Index\n",
314"\n",
315"Chapter 26:\n",
316" Using the tools: Study and learning\n",
317"Chapter 27:\n",
318" Using the tools: Everyday ways to train your\n",
319"memory\n",
320"Chapter 28:\n",
321" Using the tools: Just for fun\n",
322"Chapter 29:\n",
323" Age equals experience, not forgetfulness!\n",
324"Chapter 30:\n",
325" I’ve done all that, now what can I expect?\n",
326"Chapter 31:\n",
327" Look at what you can do now!\n",
328" \n",
329"Afterword: The champions of the future\n",
330" \n",
331"Index\n"
332]
333}
334],
335"source": [
336"# Define the query.\n",
337"query = \"What is the introduction on book?\"\n",
338"\n",
339"# Execute the query, get top 2 results.\n",
340"vector_results = neo4j_vector.similarity_search(query, k=2)\n",
341"\n",
342"# Print search results with separation.\n",
343"for i, res in enumerate(vector_results):\n",
344" print(res.page_content)\n",
345" if i != len(vector_results) - 1:\n",
346" print()\n",
347"\n",
348"# Store the content of the most similar result.\n",
349"vector_result = vector_results[0].page_content"
350]
351},
352{
353"cell_type": "markdown",
354"metadata": {
355"id": "yLCySJqcxV3W"
356},
357"source": [
358"#### Building Knowledge Graph"
359]
360},
361{
362"cell_type": "code",
363"execution_count": null,
364"metadata": {
365"id": "EOHIAZrLxh8N"
366},
367"outputs": [],
368"source": [
369"# Necessary libraries to set up the Neo4j DB question-answering chain\n",
370"from langchain.chat_models import ChatOpenAI\n",
371"from langchain.chains import GraphCypherQAChain\n",
372"from langchain.graphs import Neo4jGraph"
373]
374},
375{
376"cell_type": "code",
377"execution_count": null,
378"metadata": {
379"id": "N_sqke-SzfEQ"
380},
381"outputs": [],
382"source": [
383"# Create a Neo4jGraph object by connecting to a Neo4j database.\n",
384"graph = Neo4jGraph(\n",
385" url=\"bolt://localhost:7687\", username=\"neo4j\", password=\"docdb@123\"  # FIXME: hardcoded credentials — reuse the NEO4J_* environment variables set earlier\n",
386")\n",
387"# from py2neo import Graph\n",
388"# graph = Graph(os.environ[\"NEO4J_URI\"],\n",
389"# auth = (os.environ[\"NEO4J_USERNAME\"], \n",
390"# os.environ[\"NEO4J_PASSWORD\"]))"
391]
392},
393{
394"cell_type": "code",
395"execution_count": null,
396"metadata": {
397"colab": {
398"base_uri": "https://localhost:8080/"
399},
400"id": "oHFGTHCEz_UD",
401"outputId": "eaf7ba8f-7188-4b5a-f4b3-f786f4a457ae"
402},
403"outputs": [],
404"source": [
405"# Print the schema of the Neo4j graph.\n",
406"print(graph.schema)"
407]
408},
409{
410"cell_type": "code",
411"execution_count": null,
412"metadata": {
413"id": "wN_9M9fi0OV8"
414},
415"outputs": [],
416"source": [
417"# Create a question-answering chain using an OpenAI chat model and a Neo4j graph, with verbose mode enabled.\n",
418"chain = GraphCypherQAChain.from_llm(\n",
419" ChatOpenAI(temperature=0.9), graph=graph, verbose=True\n",
420")"
421]
422},
423{
424"cell_type": "code",
425"execution_count": null,
426"metadata": {
427"colab": {
428"base_uri": "https://localhost:8080/"
429},
430"id": "Iw0foDNr0c-v",
431"outputId": "8a776932-7768-4191-d2a4-37c2163b9d2e"
432},
433"outputs": [],
434"source": [
435"# Use the question-answering chain to query the Neo4j graph.\n",
436"graph_result = chain.run(\"What is the book about?\")"
437]
438},
439{
440"cell_type": "code",
441"execution_count": null,
442"metadata": {
443"colab": {
444"base_uri": "https://localhost:8080/",
445"height": 37
446},
447"id": "ljHsyK3z0pAf",
448"outputId": "7d95b141-ce27-4404-9e09-1f12c07f3ab8"
449},
450"outputs": [],
451"source": [
452"graph_result"
453]
454}
455],
456"metadata": {
457"colab": {
458"provenance": []
459},
460"kernelspec": {
461"display_name": "Python 3",
462"name": "python3"
463},
464"language_info": {
465"codemirror_mode": {
466"name": "ipython",
467"version": 3
468},
469"file_extension": ".py",
470"mimetype": "text/x-python",
471"name": "python",
472"nbconvert_exporter": "python",
473"pygments_lexer": "ipython3",
474"version": "3.9.18"
475}
476},
477"nbformat": 4,
478"nbformat_minor": 0
479}
480