milvus-io/bootcamp: multi_doc_qa_llamaindex.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install llama-index nltk milvus pymilvus langchain openai python-dotenv requests"
   ]
  },
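  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Version note (an added suggestion, not part of the original notebook): the imports below use the pre-0.7 `llama_index` API (`GPTVectorStoreIndex`, `ServiceContext`, `LLMPredictor`), and the output further down shows Milvus Lite 2.2.8, so a roughly contemporary 0.6.x release of `llama-index` is assumed here. If the unpinned install above pulls in a newer, incompatible release, pin it, for example as sketched in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical pin: a llama-index 0.6.x release is assumed to match the legacy API used below;\n",
    "# adjust or drop the constraint as needed for your environment.\n",
    "! pip install \"llama-index>=0.6,<0.7\" nltk milvus pymilvus langchain openai python-dotenv requests"
   ]
  },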
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/yujiantang/nltk_data...\n",
      "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "import ssl\n",
    "\n",
    "# Work around SSL certificate errors that can block NLTK's downloader\n",
    "try:\n",
    "    _create_unverified_https_context = ssl._create_unverified_context\n",
    "except AttributeError:\n",
    "    pass\n",
    "else:\n",
    "    ssl._create_default_https_context = _create_unverified_https_context\n",
    "\n",
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index import (\n",
    "    GPTVectorStoreIndex,\n",
    "    GPTSimpleKeywordTableIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    LLMPredictor,\n",
    "    ServiceContext,\n",
    "    StorageContext\n",
    ")\n",
    "from langchain.llms.openai import OpenAIChat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "    __  _________ _   ____  ______\n",
      "   /  |/  /  _/ /| | / / / / / __/\n",
      "  / /|_/ // // /_| |/ / /_/ /\\ \\\n",
      " /_/  /_/___/____/___/\\____/___/ {Lite}\n",
      "\n",
      " Welcome to use Milvus!\n",
      "\n",
      " Version:   v2.2.8-lite\n",
      " Process:   49630\n",
      " Started:   2023-05-18 16:00:02\n",
      " Config:    /Users/yujiantang/.milvus.io/milvus-server/2.2.8/configs/milvus.yaml\n",
      " Logs:      /Users/yujiantang/.milvus.io/milvus-server/2.2.8/logs\n",
      "\n",
      " Ctrl+C to exit ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[93m[has_collection] retry:4, cost: 0.27s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>\u001b[0m\n",
      "\u001b[93m[has_collection] retry:5, cost: 0.81s, reason: <_MultiThreadedRendezvous: StatusCode.UNAVAILABLE, internal: Milvus Proxy is not ready yet. please wait>\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from dotenv import load_dotenv\n",
    "import openai\n",
    "\n",
    "# Load the OpenAI API key from a local .env file\n",
    "load_dotenv()\n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
    "\n",
    "from llama_index.vector_stores import MilvusVectorStore\n",
    "from milvus import default_server\n",
    "\n",
    "# Start an embedded Milvus Lite server and point the LlamaIndex Milvus vector store at it\n",
    "default_server.start()\n",
    "vector_store = MilvusVectorStore(\n",
    "    host=\"127.0.0.1\",\n",
    "    port=default_server.listen_port\n",
    ")\n"
   ]
  },
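  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optional sanity check (an added sketch, not in the original walkthrough): confirm the embedded Milvus Lite server is reachable before building any indexes. It assumes only the `pymilvus` client installed above; on a fresh server `utility.list_collections()` should return an empty list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: verify the embedded Milvus Lite server is reachable\n",
    "from pymilvus import connections, utility\n",
    "\n",
    "connections.connect(host=\"127.0.0.1\", port=default_server.listen_port)\n",
    "print(utility.list_collections())  # expect [] on a fresh server"
   ]
  },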
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "wiki_titles = [\"Toronto\", \"Seattle\", \"San Francisco\", \"Chicago\", \"Boston\", \"Washington, D.C.\", \"Cambridge, Massachusetts\", \"Houston\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import requests\n",
    "\n",
    "# Download the plain-text extract of each city's Wikipedia article into data/\n",
    "for title in wiki_titles:\n",
    "    response = requests.get(\n",
    "        'https://en.wikipedia.org/w/api.php',\n",
    "        params={\n",
    "            'action': 'query',\n",
    "            'format': 'json',\n",
    "            'titles': title,\n",
    "            'prop': 'extracts',\n",
    "            # 'exintro': True,\n",
    "            'explaintext': True,\n",
    "        }\n",
    "    ).json()\n",
    "    page = next(iter(response['query']['pages'].values()))\n",
    "    wiki_text = page['extract']\n",
    "\n",
    "    data_path = Path('data')\n",
    "    if not data_path.exists():\n",
    "        Path.mkdir(data_path)\n",
    "\n",
    "    with open(data_path / f\"{title}.txt\", 'w') as fp:\n",
    "        fp.write(wiki_text)"
   ]
  },
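  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Quick check (an optional added sketch): list the files just written to `data/` and their sizes, so a failed Wikipedia request is caught before indexing. It uses only the standard library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: confirm each city article was downloaded and is non-trivial in size\n",
    "from pathlib import Path\n",
    "\n",
    "for path in sorted(Path(\"data\").glob(\"*.txt\")):\n",
    "    print(f\"{path.name}: {path.stat().st_size / 1024:.1f} KB\")"
   ]
  },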
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load all wiki documents\n",
    "city_docs = {}\n",
    "for wiki_title in wiki_titles:\n",
    "    city_docs[wiki_title] = SimpleDirectoryReader(input_files=[f\"data/{wiki_title}.txt\"]).load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yujiantang/Documents/workspace/hello_world_project/hw_milvus/lib/python3.10/site-packages/langchain/llms/openai.py:696: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "llm_predictor_chatgpt = LLMPredictor(llm=OpenAIChat(temperature=0, model_name=\"gpt-3.5-turbo\"))\n",
    "service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor_chatgpt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build city document index\n",
    "city_indices = {}\n",
    "index_summaries = {}\n",
    "for wiki_title in wiki_titles:\n",
    "    city_indices[wiki_title] = GPTVectorStoreIndex.from_documents(city_docs[wiki_title], service_context=service_context, storage_context=storage_context)\n",
    "    # set summary text for city\n",
    "    index_summaries[wiki_title] = f\"Wikipedia articles about {wiki_title}\""
   ]
  },
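  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before composing the graph, you can sanity-check a single per-city index on its own. The next cell is an illustrative addition: it reuses the same `as_query_engine(service_context=...)` call that appears later in the notebook, and the question shown is just an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: query one per-city index directly before building the composed graph\n",
    "seattle_engine = city_indices[\"Seattle\"].as_query_engine(service_context=service_context)\n",
    "single_city_response = seattle_engine.query(\"What is the name of the major airport in Seattle?\")\n",
    "print(str(single_city_response))"
   ]
  },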
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.indices.composability import ComposableGraph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph = ComposableGraph.from_indices(\n",
    "    GPTSimpleKeywordTableIndex,\n",
    "    list(city_indices.values()),\n",
    "    list(index_summaries.values()),\n",
    "    max_keywords_per_chunk=50\n",
    ")"
   ]
  },
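  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The keyword-table root index routes questions to the per-city sub-indexes by their ids. As an added sketch, the cell below prints each sub-index id next to the summary it was registered with, which makes the `custom_query_engines` mapping built later easier to follow."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: inspect how each per-city index is registered in the composed graph\n",
    "print(\"root index id:\", graph.root_index.index_id)\n",
    "for title in wiki_titles:\n",
    "    print(f\"{city_indices[title].index_id} -> {index_summaries[title]}\")"
   ]
  },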
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.indices.query.query_transform.base import DecomposeQueryTransform\n",
    "decompose_transform = DecomposeQueryTransform(\n",
    "    llm_predictor_chatgpt, verbose=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.query_engine.transform_query_engine import TransformQueryEngine\n",
    "custom_query_engines = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap each city index's query engine with the query decomposition transform\n",
    "for index in city_indices.values():\n",
    "    query_engine = index.as_query_engine(service_context=service_context)\n",
    "    transform_extra_info = {'index_summary': index.index_struct.summary}\n",
    "    transformed_query_engine = TransformQueryEngine(query_engine, decompose_transform, transform_extra_info=transform_extra_info)\n",
    "    custom_query_engines[index.index_id] = transformed_query_engine\n",
    "\n",
    "custom_query_engines[graph.root_index.index_id] = graph.root_index.as_query_engine(\n",
    "    retriever_mode='simple',\n",
    "    response_mode='tree_summarize',\n",
    "    service_context=service_context\n",
    ")\n",
    "\n",
    "query_engine_decompose = graph.as_query_engine(\n",
    "    custom_query_engines=custom_query_engines,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What is the name of the airport in Seattle?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What is the name of the airport in Seattle?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are the major airports in Houston?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are the major airports in Houston?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are some notable features of the Toronto airport?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the airports in Seattle, Houston, and Toronto. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are some notable features of the Toronto Pearson International Airport?\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "response_chatgpt = query_engine_decompose.query(\n",
    "    \"Compare and contrast the airports in Seattle, Houston, and Toronto. \"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Seattle has one major airport called Seattle-Tacoma International Airport, while Houston has two major airports called George Bush Intercontinental Airport and William P. Hobby Airport, as well as a third municipal airport called Ellington Airport. Toronto's busiest airport is called Toronto Pearson International Airport and is located on the city's western boundary with Mississauga. It offers limited commercial and passenger service to nearby destinations in Canada and the United States. Seattle-Tacoma International Airport and George Bush Intercontinental Airport are both major international airports, while William P. Hobby Airport and Ellington Airport are smaller and serve more regional destinations. Toronto Pearson International Airport is Canada's busiest airport and offers a direct link to Union Station through the Union Pearson Express train service.\n"
     ]
    }
   ],
   "source": [
    "print(str(response_chatgpt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For comparison: the same composed graph without query decomposition\n",
    "custom_query_engines = {}\n",
    "for index in city_indices.values():\n",
    "    query_engine = index.as_query_engine(service_context=service_context)\n",
    "    custom_query_engines[index.index_id] = query_engine\n",
    "\n",
    "custom_query_engines[graph.root_index.index_id] = graph.root_index.as_query_engine(\n",
    "    retriever_mode='simple',\n",
    "    response_mode='tree_summarize',\n",
    "    service_context=service_context\n",
    ")\n",
    "\n",
    "query_engine = graph.as_query_engine(\n",
    "    custom_query_engines=custom_query_engines,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The context information provided does not contain enough information to answer the question.'"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response_chatgpt = query_engine.query(\n",
    "    \"Compare and contrast the airports in Seattle, Houston, and Toronto. \"\n",
    ")\n",
    "str(response_chatgpt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What sports teams are based in Houston?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What sports teams are based in Houston?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are some notable sports teams based in Boston?\n",
      "\u001b[0m\u001b[33;1m\u001b[1;3m> Current query: Compare and contrast the sports environment of Houston and Boston. \n",
      "\u001b[0m\u001b[38;5;200m\u001b[1;3m> New query: What are some notable sports teams based in Boston?\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "response_chatgpt = query_engine_decompose.query(\n",
    "    \"Compare and contrast the sports environment of Houston and Boston. \"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"Houston has sports teams for every major professional league except the National Hockey League, while Boston has teams for the NFL, MLB, NBA, and NHL. Both cities have professional soccer teams, with Houston having a Major League Soccer franchise and Boston having a National Women's Soccer League team. Boston also has several college sports teams and Esports teams, while Houston does not have any notable college sports teams or Esports teams. Both cities are known for hosting major sporting events, with Boston hosting the Boston Marathon and the Head of the Charles Regatta, while Houston does not have any comparable events. Overall, Boston has a more diverse sports environment with teams in all major professional leagues and a strong presence in college sports and Esports.\""
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str(response_chatgpt)"
   ]
  },
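  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cleanup (an added suggestion): once you are done querying, stop the embedded Milvus Lite server that was started earlier with `default_server.start()`; `default_server.stop()` is the counterpart call on the same object from the `milvus` package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional cleanup: shut down the embedded Milvus Lite server started earlier\n",
    "default_server.stop()"
   ]
  },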
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "hw_milvus",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}