from argparse import ArgumentParser
from os import listdir, makedirs
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from packaging.version import Version, parse

from transformers import is_tf_available, is_torch_available
from transformers.file_utils import ModelOutput
from transformers.pipelines import Pipeline, pipeline
from transformers.tokenization_utils import BatchEncoding


# This is the minimal required version to
# support some ONNX Runtime features
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")


SUPPORTED_PIPELINES = [
    "feature-extraction",
    "ner",
    "sentiment-analysis",
    "fill-mask",
    "question-answering",
    "text-generation",
    "translation_en_to_fr",
    "translation_en_to_de",
    "translation_en_to_ro",
]


class OnnxConverterArgumentParser(ArgumentParser):
    """
    Wraps all the script arguments supported to export transformers models to ONNX IR
    """

    def __init__(self):
        super().__init__("ONNX Converter")

        self.add_argument(
            "--pipeline", type=str, choices=SUPPORTED_PIPELINES, default="feature-extraction",
        )
        self.add_argument(
            "--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)",
        )
        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
        self.add_argument(
            "--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model",
        )
        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
        self.add_argument(
            "--check-loading", action="store_true", help="Check ONNX Runtime is able to load the model",
        )
        self.add_argument(
            "--use-external-format", action="store_true", help="Allow exporting models larger than 2GB",
        )
        self.add_argument(
            "--quantize", action="store_true", help="Quantize the neural network to be run with int8",
        )
        self.add_argument("output")


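# For illustration (not part of the original script): parsing a minimal command line with
# OnnxConverterArgumentParser, e.g.
#   OnnxConverterArgumentParser().parse_args(["--model", "bert-base-cased", "onnx/model.onnx"])
# yields a Namespace with pipeline="feature-extraction" and opset=11 from the defaults,
# model="bert-base-cased", and output="onnx/model.onnx".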
def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Append a string identifier at the end (before the extension, if any) of the provided filepath.
    Args:
        filename: pathlib.Path The actual path object we would like to add an identifier suffix to
        identifier: The suffix to add

    Returns: Path with the identifier concatenated at the end of the filename
    """
    return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)


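# Example behavior of generate_identified_filename (derived from the helper above, shown as a
# comment so it is not executed on import):
#   generate_identified_filename(Path("onnx/model.onnx"), "-optimized") -> Path("onnx/model-optimized.onnx")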
def check_onnxruntime_requirements(minimum_version: Version):
    """
    Check onnxruntime is installed and that the installed version is recent enough.
    Raises:
        ImportError: If onnxruntime is not installed or the version found is too old
    """
    try:
        import onnxruntime
    except ImportError:
        raise ImportError(
            "onnxruntime doesn't seem to be currently installed. "
            "Please install the onnxruntime by running `pip install onnxruntime`"
            " and relaunch the conversion."
        )

    # Parse the version of the installed onnxruntime
    # (the version check is done outside the try block so a too-old version is not
    # misreported as a missing installation)
    ort_version = parse(onnxruntime.__version__)

    # We require at least the requested minimum version (1.4.0 for quantization)
    if ort_version < minimum_version:
        raise ImportError(
            f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
            f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
            f"Please update onnxruntime by running `pip install --upgrade onnxruntime`"
        )


def ensure_valid_input(model, tokens, input_names):
    """
    Ensure inputs are presented in the correct order, without any None
    Args:
        model: The model used to forward the input data
        tokens: BatchEncoding holding the input data
        input_names: The names of the inputs

    Returns: Tuple
    """
    print("Ensuring inputs are in correct order")

    model_args_name = model.forward.__code__.co_varnames
    model_args, ordered_input_names = [], []
    for arg_name in model_args_name[1:]:  # start at index 1 to skip "self" argument
        if arg_name in input_names:
            ordered_input_names.append(arg_name)
            model_args.append(tokens[arg_name])
        else:
            print(f"{arg_name} is not present in the generated input list.")
            break

    print(f"Generated inputs order: {ordered_input_names}")
    return ordered_input_names, tuple(model_args)


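# Illustrative sketch of ensure_valid_input (assumes a BERT-like forward signature; not part of
# the original file): given forward(self, input_ids, attention_mask, token_type_ids, ...) and a
# BatchEncoding with matching keys, it returns
#   (["input_ids", "attention_mask", "token_type_ids"], (input_ids, attention_mask, token_type_ids))
# so the positional arguments passed to torch.onnx.export follow the model's own argument order.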
def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
    """
    Attempt to infer the static vs dynamic axes for each input and output tensor of a specific model.
    Args:
        nlp: The pipeline object holding the model to be exported
        framework: The framework identifier to dispatch to the correct inference scheme (pt/tf)

    Returns:
        - List of the inferred input variable names
        - List of the inferred output variable names
        - Dictionary with input/output variable names as keys and their dynamic axes as values
        - a BatchEncoding reference which was used to infer all the above information
    """

    def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
        if isinstance(tensor, (tuple, list)):
            return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]

        else:
            # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...)
            axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
            if is_input:
                if len(tensor.shape) == 2:
                    axes[1] = "sequence"
                else:
                    raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
            else:
                seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
                axes.update({dim: "sequence" for dim in seq_axes})

        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
        return axes

    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
    seq_len = tokens.input_ids.shape[-1]
    # TF models are called on the underlying dict (consistent with nlp.model.predict(tokens.data) below)
    outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens.data)
    if isinstance(outputs, ModelOutput):
        outputs = outputs.to_tuple()
    if not isinstance(outputs, (list, tuple)):
        outputs = (outputs,)

    # Generate input names & axes
    input_vars = list(tokens.keys())
    input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}

    # flatten potentially grouped outputs (past for gpt2, attentions)
    outputs_flat = []
    for output in outputs:
        if isinstance(output, (tuple, list)):
            outputs_flat.extend(output)
        else:
            outputs_flat.append(output)

    # Generate output names & axes
    output_names = [f"output_{i}" for i in range(len(outputs_flat))]
    output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}

    # Create the aggregated axes representation
    dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
    return input_vars, output_names, dynamic_axes, tokens


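# For illustration (hypothetical run, not executed here): for an encoder whose inputs have shape
# (1, seq_len), infer_shapes typically yields dynamic axes such as
#   {"input_ids": {0: "batch", 1: "sequence"},
#    "attention_mask": {0: "batch", 1: "sequence"},
#    "output_0": {0: "batch", 1: "sequence"}}
# which tells the ONNX exporter that the batch and sequence dimensions may vary at inference time.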
def load_graph_from_args(pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None) -> Pipeline:
    """
    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)
    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework to convert the pipeline from ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value

    Returns: Pipeline object
    """
    # If no tokenizer provided
    if tokenizer is None:
        tokenizer = model

    # Check the wanted framework is available
    if framework == "pt" and not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
    if framework == "tf" and not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework)


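# Example call of load_graph_from_args (a sketch; "bert-base-cased" is the model id used in the
# --model help text above):
#   nlp = load_graph_from_args("feature-extraction", "pt", "bert-base-cased")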
def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR)
    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored
        use_external_format: Split the model definition from its parameters to allow models bigger than 2GB

    Returns:
    """
    if not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export

    print(f"Using framework PyTorch: {torch.__version__}")

    with torch.no_grad():
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
        ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)

        export(
            nlp.model,
            model_args,
            f=output.as_posix(),
            input_names=ordered_input_names,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            do_constant_folding=True,
            use_external_data_format=use_external_format,
            enable_onnx_checker=True,
            opset_version=opset,
        )


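# Once exported by convert_pytorch, the graph can be loaded back for inference (a sketch,
# assuming onnxruntime is installed; the feed names and arrays are illustrative and must match
# the ordered_input_names printed during export):
#   session = InferenceSession(output.as_posix())
#   session.run(None, {"input_ids": input_ids_array, "attention_mask": attention_mask_array})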
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)
    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Notes: TensorFlow cannot export models bigger than 2GB due to an internal TensorFlow constraint

    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting models > 2GB /!\\")

    try:
        import tensorflow as tf
        from keras2onnx import convert_keras, save_model, __version__ as k2ov

        print(f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}")

        # Build
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)
        onnx_model = convert_keras(nlp.model, nlp.model.name, target_opset=opset)
        save_model(onnx_model, output.as_posix())

    except ImportError as e:
        raise Exception(f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first.")


def convert(
    framework: str,
    model: str,
    output: Path,
    opset: int,
    tokenizer: Optional[str] = None,
    use_external_format: bool = False,
    pipeline_name: str = "feature-extraction",
):
    """
    Convert the pipeline object to the ONNX Intermediate Representation (IR) format.
    Args:
        framework: The framework the pipeline is backed by ("pt" or "tf")
        model: The name of the model to load for the pipeline
        output: The path where the ONNX graph will be stored
        opset: The actual version of the ONNX operator set to use
        tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided
        use_external_format: Split the model definition from its parameters to allow models bigger than 2GB (PyTorch only)
        pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)

    Returns:
    """
    print(f"ONNX opset version set to: {opset}")

    # Load the pipeline
    nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer)

    if not output.parent.exists():
        print(f"Creating folder {output.parent}")
        makedirs(output.parent.as_posix())
    elif len(listdir(output.parent.as_posix())) > 0:
        raise Exception(f"Folder {output.parent.as_posix()} is not empty, aborting conversion")

    # Export the graph
    if framework == "pt":
        convert_pytorch(nlp, opset, output, use_external_format)
    else:
        convert_tensorflow(nlp, opset, output)


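# Example programmatic use of convert (a sketch; the model id and output path are illustrative,
# and the output folder must be empty or absent, as enforced above):
#   convert("pt", "bert-base-cased", Path("onnx/bert-base-cased.onnx"), opset=11)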
def optimize(onnx_model_path: Path) -> Path:
    """
    Load the model at the specified path and let onnxruntime look at transformations on the graph
    to enable all the optimizations possible.
    Args:
        onnx_model_path: filepath where the model binary description is stored

    Returns: Path where the optimized model binary description has been saved
    """
    from onnxruntime import SessionOptions, InferenceSession

    # Generate model name with suffix "optimized"
    opt_model_path = generate_identified_filename(onnx_model_path, "-optimized")
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    _ = InferenceSession(onnx_model_path.as_posix(), sess_option)

    print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}")
    print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")

    return opt_model_path


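# Typical chaining (mirrors the __main__ flow below): quantization is run on the optimizer's
# output rather than on the raw export, e.g. quantize(optimize(Path("model.onnx"))).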
def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.
    Args:
        onnx_model_path: Path to the location where the exported ONNX model is stored

    Returns: The Path generated for the quantized model
    """
    try:
        import onnx
        from onnxruntime.quantization import quantize, QuantizationMode

        onnx_model = onnx.load(onnx_model_path.as_posix())

        # Discussed with @yufenglee from ONNX runtime, this will be addressed in the next release of onnxruntime
        print(
            "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n"
            "This limitation will be removed in the next release of onnxruntime."
        )

        quantized_model = quantize(
            model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True,
        )

        # Append "-quantized" at the end of the model's name
        quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

        # Save model, then report where it was written
        onnx.save_model(quantized_model, quantized_model_path.as_posix())
        print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")

        return quantized_model_path
    except Exception as ie:
        print(f"Error while quantizing the model:\n{str(ie)}")
        # Re-raise so callers don't receive None where a Path is expected
        raise


def verify(path: Path):
    from onnxruntime import InferenceSession, SessionOptions
    from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException

    print(f"Checking ONNX model loading from: {path} ...")
    try:
        onnx_options = SessionOptions()
        _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"])
        print(f"Model {path} correctly loaded: \N{heavy check mark}")
    except RuntimeException as re:
        print(f"Error while loading the model {re}: \N{heavy ballot x}")


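# Example invocation (the script filename is illustrative; the flags come from
# OnnxConverterArgumentParser above):
#   python convert_graph_to_onnx.py --framework pt --model bert-base-cased \
#       --quantize --check-loading onnx/bert-base-cased.onnx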
if __name__ == "__main__":
    parser = OnnxConverterArgumentParser()
    args = parser.parse_args()

    # Make sure output is an absolute path
    args.output = Path(args.output).absolute()

    try:
        print("\n====== Converting model to ONNX ======")
        # Convert
        convert(
            args.framework,
            args.model,
            args.output,
            args.opset,
            args.tokenizer,
            args.use_external_format,
            args.pipeline,
        )

        if args.quantize:
            # Ensure the requirements for quantization on onnxruntime are met
            check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)

            # onnxruntime optimizations don't provide the same level of performance on TensorFlow as on PyTorch
            if args.framework == "tf":
                print(
                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
                    "\t For more information, please refer to the onnxruntime documentation:\n"
                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
                )

            print("\n====== Optimizing ONNX model ======")

            # Quantization works best when using the optimized version of the model
            args.optimized_output = optimize(args.output)

            # Do the quantization on the right graph
            args.quantized_output = quantize(args.optimized_output)

        # And verify
        if args.check_loading:
            print("\n====== Check exported ONNX model(s) ======")
            verify(args.output)

            if hasattr(args, "optimized_output"):
                verify(args.optimized_output)

            if hasattr(args, "quantized_output"):
                verify(args.quantized_output)

    except Exception as e:
        print(f"Error while converting the model: {e}")
        exit(1)