import logging
from threading import Thread
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Union

from llama_index.legacy.bridge.pydantic import Field, PrivateAttr
from llama_index.legacy.callbacks import CallbackManager
from llama_index.legacy.constants import (
    DEFAULT_CONTEXT_WINDOW,
    DEFAULT_NUM_OUTPUTS,
)
from llama_index.legacy.core.llms.types import (
    ChatMessage,
    ChatResponse,
    ChatResponseAsyncGen,
    ChatResponseGen,
    CompletionResponse,
    CompletionResponseAsyncGen,
    CompletionResponseGen,
    LLMMetadata,
    MessageRole,
)
from llama_index.legacy.llms.base import (
    llm_chat_callback,
    llm_completion_callback,
)
from llama_index.legacy.llms.custom import CustomLLM
from llama_index.legacy.llms.generic_utils import (
    completion_response_to_chat_response,
    stream_completion_response_to_chat_response,
)
from llama_index.legacy.llms.generic_utils import (
    messages_to_prompt as generic_messages_to_prompt,
)
from llama_index.legacy.prompts.base import PromptTemplate
from llama_index.legacy.types import BaseOutputParser, PydanticProgramMode

DEFAULT_HUGGINGFACE_MODEL = "StabilityAI/stablelm-tuned-alpha-3b"

if TYPE_CHECKING:
    try:
        from huggingface_hub import AsyncInferenceClient, InferenceClient
        from huggingface_hub.hf_api import ModelInfo
        from huggingface_hub.inference._types import ConversationalOutput
    except ModuleNotFoundError:
        AsyncInferenceClient = Any
        InferenceClient = Any
        ConversationalOutput = dict
        ModelInfo = Any

logger = logging.getLogger(__name__)

class HuggingFaceLLM(CustomLLM):
    """HuggingFace LLM."""
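    # A minimal usage sketch (illustrative, not part of the original source).
    # The model/tokenizer names below reuse the module default; any causal LM
    # from the Hugging Face Hub that fits in local memory works the same way:
    #
    #     llm = HuggingFaceLLM(
    #         model_name="StabilityAI/stablelm-tuned-alpha-3b",
    #         tokenizer_name="StabilityAI/stablelm-tuned-alpha-3b",
    #         context_window=2048,
    #         max_new_tokens=256,
    #         device_map="auto",
    #     )
    #     print(llm.complete("Hello").text)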
    model_name: str = Field(
        default=DEFAULT_HUGGINGFACE_MODEL,
        description=(
            "The model name to use from HuggingFace. "
            "Unused if `model` is passed in directly."
        ),
    )
    context_window: int = Field(
        default=DEFAULT_CONTEXT_WINDOW,
        description="The maximum number of tokens available for input.",
        gt=0,
    )
    max_new_tokens: int = Field(
        default=DEFAULT_NUM_OUTPUTS,
        description="The maximum number of tokens to generate.",
        gt=0,
    )
    system_prompt: str = Field(
        default="",
        description=(
            "The system prompt, containing any extra instructions or context. "
            "The model card on HuggingFace should specify if this is needed."
        ),
    )
    query_wrapper_prompt: PromptTemplate = Field(
        default=PromptTemplate("{query_str}"),
        description=(
            "The query wrapper prompt, containing the query placeholder. "
            "The model card on HuggingFace should specify if this is needed. "
            "Should contain a `{query_str}` placeholder."
        ),
    )
    tokenizer_name: str = Field(
        default=DEFAULT_HUGGINGFACE_MODEL,
        description=(
            "The name of the tokenizer to use from HuggingFace. "
            "Unused if `tokenizer` is passed in directly."
        ),
    )
    device_map: str = Field(
        default="auto", description="The device_map to use. Defaults to 'auto'."
    )
    stopping_ids: List[int] = Field(
        default_factory=list,
        description=(
            "The stopping ids to use. "
            "Generation stops when these token IDs are predicted."
        ),
    )
    tokenizer_outputs_to_remove: list = Field(
        default_factory=list,
        description=(
            "The outputs to remove from the tokenizer. "
            "Sometimes huggingface tokenizers return extra inputs that cause errors."
        ),
    )
    tokenizer_kwargs: dict = Field(
        default_factory=dict, description="The kwargs to pass to the tokenizer."
    )
    model_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during initialization.",
    )
    generate_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during generation.",
    )
    is_chat_model: bool = Field(
        default=False,
        description=(
            LLMMetadata.__fields__["is_chat_model"].field_info.description
            + " Be sure to verify that you either pass an appropriate tokenizer "
            "that can convert prompts to properly formatted chat messages or a "
            "`messages_to_prompt` that does so."
        ),
    )

    _model: Any = PrivateAttr()
    _tokenizer: Any = PrivateAttr()
    _stopping_criteria: Any = PrivateAttr()
    def __init__(
        self,
        context_window: int = DEFAULT_CONTEXT_WINDOW,
        max_new_tokens: int = DEFAULT_NUM_OUTPUTS,
        query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
        tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL,
        model_name: str = DEFAULT_HUGGINGFACE_MODEL,
        model: Optional[Any] = None,
        tokenizer: Optional[Any] = None,
        device_map: Optional[str] = "auto",
        stopping_ids: Optional[List[int]] = None,
        tokenizer_kwargs: Optional[dict] = None,
        tokenizer_outputs_to_remove: Optional[list] = None,
        model_kwargs: Optional[dict] = None,
        generate_kwargs: Optional[dict] = None,
        is_chat_model: Optional[bool] = False,
        callback_manager: Optional[CallbackManager] = None,
        system_prompt: str = "",
        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
        completion_to_prompt: Optional[Callable[[str], str]] = None,
        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
        output_parser: Optional[BaseOutputParser] = None,
    ) -> None:
        """Initialize params."""
        try:
            import torch
            from transformers import (
                AutoModelForCausalLM,
                AutoTokenizer,
                StoppingCriteria,
                StoppingCriteriaList,
            )
        except ImportError as exc:
            raise ImportError(
                f"{type(self).__name__} requires torch and transformers packages.\n"
                "Please install both with `pip install transformers[torch]`."
            ) from exc

        model_kwargs = model_kwargs or {}
        self._model = model or AutoModelForCausalLM.from_pretrained(
            model_name, device_map=device_map, **model_kwargs
        )

        # check context_window
        config_dict = self._model.config.to_dict()
        model_context_window = int(
            config_dict.get("max_position_embeddings", context_window)
        )
        if model_context_window and model_context_window < context_window:
            logger.warning(
                f"Supplied context_window {context_window} is greater "
                f"than the model's max input size {model_context_window}. "
                "Disable this warning by setting a lower context_window."
            )
            context_window = model_context_window

        tokenizer_kwargs = tokenizer_kwargs or {}
        if "max_length" not in tokenizer_kwargs:
            tokenizer_kwargs["max_length"] = context_window

        self._tokenizer = tokenizer or AutoTokenizer.from_pretrained(
            tokenizer_name, **tokenizer_kwargs
        )

        if tokenizer_name != model_name:
            logger.warning(
                f"The model `{model_name}` and tokenizer `{tokenizer_name}` "
                f"are different, please ensure that they are compatible."
            )

        # setup stopping criteria
        stopping_ids_list = stopping_ids or []

        class StopOnTokens(StoppingCriteria):
            def __call__(
                self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                **kwargs: Any,
            ) -> bool:
                for stop_id in stopping_ids_list:
                    if input_ids[0][-1] == stop_id:
                        return True
                return False

        self._stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        if isinstance(query_wrapper_prompt, str):
            query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)

        messages_to_prompt = messages_to_prompt or self._tokenizer_messages_to_prompt

        super().__init__(
            context_window=context_window,
            max_new_tokens=max_new_tokens,
            query_wrapper_prompt=query_wrapper_prompt,
            tokenizer_name=tokenizer_name,
            model_name=model_name,
            device_map=device_map,
            stopping_ids=stopping_ids or [],
            tokenizer_kwargs=tokenizer_kwargs or {},
            tokenizer_outputs_to_remove=tokenizer_outputs_to_remove or [],
            model_kwargs=model_kwargs or {},
            generate_kwargs=generate_kwargs or {},
            is_chat_model=is_chat_model,
            callback_manager=callback_manager,
            system_prompt=system_prompt,
            messages_to_prompt=messages_to_prompt,
            completion_to_prompt=completion_to_prompt,
            pydantic_program_mode=pydantic_program_mode,
            output_parser=output_parser,
        )
    @classmethod
    def class_name(cls) -> str:
        return "HuggingFace_LLM"

    @property
    def metadata(self) -> LLMMetadata:
        """LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.max_new_tokens,
            model_name=self.model_name,
            is_chat_model=self.is_chat_model,
        )

    def _tokenizer_messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
        """Use the tokenizer to convert messages to prompt. Fallback to generic."""
        if hasattr(self._tokenizer, "apply_chat_template"):
            messages_dict = [
                {"role": message.role.value, "content": message.content}
                for message in messages
            ]
            tokens = self._tokenizer.apply_chat_template(messages_dict)
            return self._tokenizer.decode(tokens)

        return generic_messages_to_prompt(messages)
    @llm_completion_callback()
    def complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        """Completion endpoint."""
        full_prompt = prompt
        if not formatted:
            if self.query_wrapper_prompt:
                full_prompt = self.query_wrapper_prompt.format(query_str=prompt)
            if self.system_prompt:
                full_prompt = f"{self.system_prompt} {full_prompt}"

        inputs = self._tokenizer(full_prompt, return_tensors="pt")
        inputs = inputs.to(self._model.device)

        # remove keys from the tokenizer if needed, to avoid HF errors
        for key in self.tokenizer_outputs_to_remove:
            if key in inputs:
                inputs.pop(key, None)

        tokens = self._model.generate(
            **inputs,
            max_new_tokens=self.max_new_tokens,
            stopping_criteria=self._stopping_criteria,
            **self.generate_kwargs,
        )
        completion_tokens = tokens[0][inputs["input_ids"].size(1) :]
        completion = self._tokenizer.decode(completion_tokens, skip_special_tokens=True)

        return CompletionResponse(text=completion, raw={"model_output": tokens})
    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        """Streaming completion endpoint."""
        from transformers import TextIteratorStreamer

        full_prompt = prompt
        if not formatted:
            if self.query_wrapper_prompt:
                full_prompt = self.query_wrapper_prompt.format(query_str=prompt)
            if self.system_prompt:
                full_prompt = f"{self.system_prompt} {full_prompt}"

        inputs = self._tokenizer(full_prompt, return_tensors="pt")
        inputs = inputs.to(self._model.device)

        # remove keys from the tokenizer if needed, to avoid HF errors
        for key in self.tokenizer_outputs_to_remove:
            if key in inputs:
                inputs.pop(key, None)

        streamer = TextIteratorStreamer(
            self._tokenizer,
            skip_prompt=True,
            decode_kwargs={"skip_special_tokens": True},
        )
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=self.max_new_tokens,
            stopping_criteria=self._stopping_criteria,
            **self.generate_kwargs,
        )

        # generate in background thread
        # NOTE/TODO: token counting doesn't work with streaming
        thread = Thread(target=self._model.generate, kwargs=generation_kwargs)
        thread.start()

        # create generator based off of streamer
        def gen() -> CompletionResponseGen:
            text = ""
            for x in streamer:
                text += x
                yield CompletionResponse(text=text, delta=x)

        return gen()
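    # Illustrative only (not part of the original source): the generator returned
    # by `stream_complete` yields partial responses that can be consumed
    # incrementally, e.g.
    #
    #     for partial in llm.stream_complete("Tell me a joke"):
    #         print(partial.delta, end="", flush=True)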
    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        prompt = self.messages_to_prompt(messages)
        completion_response = self.complete(prompt, formatted=True, **kwargs)
        return completion_response_to_chat_response(completion_response)

    @llm_chat_callback()
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        prompt = self.messages_to_prompt(messages)
        completion_response = self.stream_complete(prompt, formatted=True, **kwargs)
        return stream_completion_response_to_chat_response(completion_response)

def chat_messages_to_conversational_kwargs(
    messages: Sequence[ChatMessage],
) -> Dict[str, Any]:
    """Convert ChatMessages to keyword arguments for Inference API conversational."""
    if len(messages) % 2 != 1:
        raise NotImplementedError("Messages passed in must be of odd length.")
    last_message = messages[-1]
    kwargs: Dict[str, Any] = {
        "text": last_message.content,
        **last_message.additional_kwargs,
    }
    if len(messages) != 1:
        kwargs["past_user_inputs"] = []
        kwargs["generated_responses"] = []
        for user_msg, assistant_msg in zip(messages[::2], messages[1::2]):
            if (
                user_msg.role != MessageRole.USER
                or assistant_msg.role != MessageRole.ASSISTANT
            ):
                raise NotImplementedError(
                    "Didn't handle when messages aren't ordered in alternating"
                    f" pairs of {(MessageRole.USER, MessageRole.ASSISTANT)}."
                )
            kwargs["past_user_inputs"].append(user_msg.content)
            kwargs["generated_responses"].append(assistant_msg.content)
    return kwargs
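# Illustrative example (not from the original source) of the mapping performed by
# `chat_messages_to_conversational_kwargs`: an alternating user/assistant history
# ending on a user turn, e.g.
#
#     [ChatMessage(role=MessageRole.USER, content="Hi"),
#      ChatMessage(role=MessageRole.ASSISTANT, content="Hello!"),
#      ChatMessage(role=MessageRole.USER, content="How are you?")]
#
# becomes
#
#     {"text": "How are you?",
#      "past_user_inputs": ["Hi"],
#      "generated_responses": ["Hello!"]}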

class HuggingFaceInferenceAPI(CustomLLM):
    """
    Wrapper on the Hugging Face's Inference API.

    Overview of the design:
    - Synchronous uses InferenceClient, asynchronous uses AsyncInferenceClient
    - chat uses the conversational task: https://huggingface.co/tasks/conversational
    - complete uses the text generation task: https://huggingface.co/tasks/text-generation

    Note: some models that support the text generation task can leverage Hugging
    Face's optimized deployment toolkit called text-generation-inference (TGI).
    Use InferenceClient.get_model_status to check if TGI is being used.

    Relevant links:
    - General Docs: https://huggingface.co/docs/api-inference/index
    - API Docs: https://huggingface.co/docs/huggingface_hub/main/en/package_reference/inference_client
    - Source: https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub/inference
    """
    @classmethod
    def class_name(cls) -> str:
        return "HuggingFaceInferenceAPI"

    # Corresponds with huggingface_hub.InferenceClient
    model_name: Optional[str] = Field(
        default=None,
        description=(
            "The model to run inference with. Can be a model id hosted on the Hugging"
            " Face Hub, e.g. bigcode/starcoder or a URL to a deployed Inference"
            " Endpoint. Defaults to None, in which case a recommended model is"
            " automatically selected for the task (see Field below)."
        ),
    )
    token: Union[str, bool, None] = Field(
        default=None,
        description=(
            "Hugging Face token. Will default to the locally saved token. Pass "
            "token=False if you don't want to send your token to the server."
        ),
    )
    timeout: Optional[float] = Field(
        default=None,
        description=(
            "The maximum number of seconds to wait for a response from the server."
            " Loading a new model in Inference API can take up to several minutes."
            " Defaults to None, meaning it will loop until the server is available."
        ),
    )
    headers: Dict[str, str] = Field(
        default=None,
        description=(
            "Additional headers to send to the server. By default only the"
            " authorization and user-agent headers are sent. Values in this dictionary"
            " will override the default values."
        ),
    )
    cookies: Dict[str, str] = Field(
        default=None, description="Additional cookies to send to the server."
    )
    task: Optional[str] = Field(
        default=None,
        description=(
            "Optional task to pick Hugging Face's recommended model, used when"
            " model_name is left as default of None."
        ),
    )

    _sync_client: "InferenceClient" = PrivateAttr()
    _async_client: "AsyncInferenceClient" = PrivateAttr()
    _get_model_info: "Callable[..., ModelInfo]" = PrivateAttr()

    context_window: int = Field(
        default=DEFAULT_CONTEXT_WINDOW,
        description=(
            LLMMetadata.__fields__["context_window"].field_info.description
            + " This may be looked up in a model's `config.json`."
        ),
    )
    num_output: int = Field(
        default=DEFAULT_NUM_OUTPUTS,
        description=LLMMetadata.__fields__["num_output"].field_info.description,
    )
    is_chat_model: bool = Field(
        default=False,
        description=(
            LLMMetadata.__fields__["is_chat_model"].field_info.description
            + " Unless chat templating is intentionally applied, Hugging Face models"
            " are not chat models."
        ),
    )
    is_function_calling_model: bool = Field(
        default=False,
        description=(
            LLMMetadata.__fields__["is_function_calling_model"].field_info.description
            + " As of 10/17/2023, Hugging Face doesn't support function calling"
            " messages."
        ),
    )
    def _get_inference_client_kwargs(self) -> Dict[str, Any]:
        """Extract the Hugging Face InferenceClient construction parameters."""
        return {
            "model": self.model_name,
            "token": self.token,
            "timeout": self.timeout,
            "headers": self.headers,
            "cookies": self.cookies,
        }

    def __init__(self, **kwargs: Any) -> None:
        """Initialize.

        Args:
            kwargs: See the class-level Fields.
        """
        try:
            from huggingface_hub import (
                AsyncInferenceClient,
                InferenceClient,
                model_info,
            )
        except ModuleNotFoundError as exc:
            raise ImportError(
                f"{type(self).__name__} requires huggingface_hub with its inference"
                " extra, please run `pip install huggingface_hub[inference]>=0.19.0`."
            ) from exc
        if kwargs.get("model_name") is None:
            task = kwargs.get("task", "")
            # NOTE: task being None or empty string leads to ValueError,
            # which ensures model is present
            kwargs["model_name"] = InferenceClient.get_recommended_model(task=task)
            logger.debug(
                f"Using Hugging Face's recommended model {kwargs['model_name']}"
                f" given task {task}."
            )
        if kwargs.get("task") is None:
            task = "conversational"
        else:
            task = kwargs["task"].lower()

        super().__init__(**kwargs)  # Populate pydantic Fields
        self._sync_client = InferenceClient(**self._get_inference_client_kwargs())
        self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs())
        self._get_model_info = model_info
    def validate_supported(self, task: str) -> None:
        """
        Confirm the contained model_name is deployed on the Inference API service.

        Args:
            task: Hugging Face task to check within. A list of all tasks can be
                found here: https://huggingface.co/tasks
        """
        all_models = self._sync_client.list_deployed_models(frameworks="all")
        try:
            if self.model_name not in all_models[task]:
                raise ValueError(
                    "The Inference API service doesn't have the model"
                    f" {self.model_name!r} deployed."
                )
        except KeyError as exc:
            raise KeyError(
                f"Input task {task!r} not in possible tasks {list(all_models.keys())}."
            ) from exc

    def get_model_info(self, **kwargs: Any) -> "ModelInfo":
        """Get metadata on the current model from Hugging Face."""
        return self._get_model_info(self.model_name, **kwargs)
    @property
    def metadata(self) -> LLMMetadata:
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            is_chat_model=self.is_chat_model,
            is_function_calling_model=self.is_function_calling_model,
            model_name=self.model_name,
        )
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        # default to conversational task as that was the previous functionality
        if self.task == "conversational" or self.task is None:
            output: "ConversationalOutput" = self._sync_client.conversational(
                **{**chat_messages_to_conversational_kwargs(messages), **kwargs}
            )
            return ChatResponse(
                message=ChatMessage(
                    role=MessageRole.ASSISTANT, content=output["generated_text"]
                )
            )
        else:
            # try and use text generation
            prompt = self.messages_to_prompt(messages)
            completion = self.complete(prompt)
            return ChatResponse(
                message=ChatMessage(role=MessageRole.ASSISTANT, content=completion.text)
            )

    def complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        return CompletionResponse(
            text=self._sync_client.text_generation(
                prompt, **{**{"max_new_tokens": self.num_output}, **kwargs}
            )
        )
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        raise NotImplementedError

    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        raise NotImplementedError

    async def achat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponse:
        raise NotImplementedError

    async def acomplete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        response = await self._async_client.text_generation(
            prompt, **{**{"max_new_tokens": self.num_output}, **kwargs}
        )
        return CompletionResponse(text=response)
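    # Illustrative async usage (not part of the original source): `acomplete`
    # wraps AsyncInferenceClient.text_generation and can be driven with asyncio,
    # e.g.
    #
    #     import asyncio
    #     response = asyncio.run(llm.acomplete("Write a haiku about the sea"))
    #     print(response.text)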
    async def astream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseAsyncGen:
        raise NotImplementedError

    async def astream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseAsyncGen:
        raise NotImplementedError