autogen

process_notebooks.py
523 строки · 17.6 Кб
Перенос по словам
1
#!/usr/bin/env python
2

3
from __future__ import annotations
4

5
import argparse
6
import concurrent.futures
7
import json
8
import os
9
import shutil
10
import signal
11
import subprocess
12
import sys
13
import tempfile
14
import threading
15
import time
16
import typing
17
from dataclasses import dataclass
18
from multiprocessing import current_process
19
from pathlib import Path
20
from typing import Dict, Optional, Tuple, Union
21

22
from termcolor import colored
23

24
try:
25
    import yaml
26
except ImportError:
27
    print("pyyaml not found.\n\nPlease install pyyaml:\n\tpip install pyyaml\n")
28
    sys.exit(1)
29

30
try:
31
    import nbclient
32
    from nbclient.client import (
33
        CellExecutionError,
34
        CellTimeoutError,
35
        NotebookClient,
36
    )
37
except ImportError:
38
    if current_process().name == "MainProcess":
39
        print("nbclient not found.\n\nPlease install nbclient:\n\tpip install nbclient\n")
40
        print("test won't work without nbclient")
41

42
try:
    import nbformat
    from nbformat import NotebookNode
except ImportError:
    # Only the main process reports the missing dependency; other processes
    # that import this module stay silent.
    if current_process().name == "MainProcess":
        print("nbformat not found.\n\nPlease install nbformat:\n\tpip install nbformat\n")
        # Fixed copy-paste error: this guard is about nbformat, not nbclient.
        print("test won't work without nbformat")
49

50

51
class Result:
    """Simple container bundling a return code with stdout and stderr text."""

    def __init__(self, returncode: int, stdout: str, stderr: str):
        # Store the three values unchanged under attributes of the same name.
        self.returncode, self.stdout, self.stderr = returncode, stdout, stderr
56

57

58
def check_quarto_bin(quarto_bin: str = "quarto") -> None:
    """Check that quarto is installed and recent enough.

    Runs ``<quarto_bin> --version`` and requires the reported version to be at
    least 1.5.23. On any failure (binary missing, command failing, version
    output not parseable, version too old) a message is printed and the
    process exits with status 1.

    Args:
        quarto_bin: Name or path of the quarto executable.
    """
    minimum_version = (1, 5, 23)

    try:
        raw_version = subprocess.check_output([quarto_bin, "--version"], text=True).strip()
    except FileNotFoundError:
        print("Quarto is not installed. Please install it from https://quarto.org")
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        # The binary exists but `--version` itself failed.
        print(f"Failed to run '{quarto_bin} --version' (exit code {err.returncode}).")
        sys.exit(1)

    try:
        version = tuple(int(part) for part in raw_version.split("."))
    except ValueError:
        # Anything that is not a dotted list of integers cannot be compared.
        print(f"Could not parse quarto version from {raw_version!r}.")
        sys.exit(1)

    if version < minimum_version:
        print("Quarto version is too old. Please upgrade to 1.5.23 or later.")
        sys.exit(1)
70

71

72
def notebooks_target_dir(website_directory: Path) -> Path:
    """Return the ``docs/notebooks`` directory underneath ``website_directory``."""
    return website_directory.joinpath("docs", "notebooks")
75

76

77
def load_metadata(notebook: Path) -> typing.Dict:
    """Return the top-level ``metadata`` object of a notebook file.

    Args:
        notebook: Path to a JSON notebook file, read as UTF-8.

    Returns:
        The value stored under the ``"metadata"`` key.

    Raises:
        KeyError: If the notebook JSON has no ``"metadata"`` key.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with notebook.open(encoding="utf-8") as f:
        content = json.load(f)
    return content["metadata"]
80

81

82
def skip_reason_or_none_if_ok(notebook: Path) -> typing.Optional[str]:
    """Return a reason to skip the notebook, or None if it should not be skipped.

    Files that are not ``.ipynb`` or do not exist are always skipped. Notebooks
    whose path contains a ``notebook`` component are additionally validated:
    their metadata must carry a ``front_matter`` object with a list-of-strings
    ``tags`` entry and a string ``description`` entry, unless ``skip_render``
    is set, in which case its value is returned as the reason.

    Args:
        notebook: Path of the candidate notebook.

    Returns:
        A skip reason, or None when the notebook should be processed.

    Raises:
        ValueError: If the first markdown cell still starts with an HTML
            comment opener (``<!--``), the old place for front matter.
    """
    if notebook.suffix != ".ipynb":
        return "not a notebook"

    if not notebook.exists():
        return "file does not exist"

    # Extra checks for notebooks in the notebook directory
    if "notebook" not in notebook.parts:
        return None

    # Parse the file once; cells and metadata both come from this object.
    with open(notebook, "r", encoding="utf-8") as f:
        json_content = json.load(f)

    cells = json_content["cells"]
    if cells:
        first_cell = cells[0]
        source = first_cell["source"]
        # The source may be stored as one string or as a list of lines.
        source_lines = source.splitlines() if isinstance(source, str) else source

        # <!-- and --> must exist on lines of their own
        if first_cell["cell_type"] == "markdown" and source_lines and source_lines[0].strip() == "<!--":
            raise ValueError(
                f"Error in {str(notebook.resolve())} - Front matter should be defined in the notebook metadata now."
            )

    metadata = json_content["metadata"]

    if "skip_render" in metadata:
        return metadata["skip_render"]

    if "front_matter" not in metadata:
        return "front matter missing from notebook metadata ⚠️"

    front_matter = metadata["front_matter"]

    if "tags" not in front_matter:
        return "tags is not in front matter"

    if "description" not in front_matter:
        return "description is not in front matter"

    # Make sure tags is a list of strings. The container type is checked too,
    # because iterating a plain string yields characters that are all `str`.
    tags = front_matter["tags"]
    if not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags):
        return "tags must be a list of strings"

    # Make sure description is a string
    if not isinstance(front_matter["description"], str):
        return "description must be a string"

    return None
133

134

135
def extract_title(notebook: Path) -> Optional[str]:
    """Extract the title of the notebook.

    The title is the text of the first line in the first cell that starts
    with ``"# "``. A trailing ``{...}`` suffix on that line is dropped.

    Args:
        notebook: Path to a JSON notebook file, read as UTF-8.

    Returns:
        The title, or None when the notebook has no cells or its first cell
        contains no ``"# "`` line.
    """
    with open(notebook, "r", encoding="utf-8") as f:
        json_content = json.load(f)

    cells = json_content["cells"]
    if not cells:
        # A notebook without cells has no title.
        return None

    source = cells[0]["source"]
    # The source may be one string rather than a list of lines; split it so
    # the loop below sees lines, not characters.
    if isinstance(source, str):
        source = source.splitlines()

    # find the # title
    for line in source:
        if line.startswith("# "):
            title = line[2:].strip()
            # Strip off the { if it exists
            if "{" in title:
                title = title[: title.find("{")].strip()
            return title

    return None
154

155

156
def process_notebook(src_notebook: Path, website_dir: Path, notebook_dir: Path, quarto_bin: str, dry_run: bool) -> str:
    """Process a single notebook.

    Renders the notebook with quarto and post-processes the resulting ``.mdx``
    file. Notebooks whose path contains a ``notebook`` component are first
    copied into the website's notebooks directory and rendered from there;
    all other notebooks are rendered in place.

    Args:
        src_notebook: Notebook to render.
        website_dir: Root directory of the website.
        notebook_dir: Directory that notebook-directory notebooks are relative to.
        quarto_bin: Name or path of the quarto executable.
        dry_run: When True, report what would be done without rendering.

    Returns:
        A formatted status line (ok, skip, error, or "Would process ...").
    """
    in_notebook_dir = "notebook" in src_notebook.parts

    metadata = load_metadata(src_notebook)

    title = extract_title(src_notebook)
    if title is None:
        return fmt_error(src_notebook, "Title not found in notebook")

    front_matter = metadata.get("front_matter", {})
    front_matter["title"] = title

    if in_notebook_dir:
        relative_notebook = src_notebook.resolve().relative_to(notebook_dir.resolve())
        dest_dir = notebooks_target_dir(website_directory=website_dir)
        target_file = dest_dir / relative_notebook.with_suffix(".mdx")
        # The notebook is copied next to its target because quarto does not
        # support rendering from a different directory.
        intermediate_notebook: Optional[Path] = dest_dir / relative_notebook
        render_input = intermediate_notebook
    else:
        target_file = src_notebook.with_suffix(".mdx")
        intermediate_notebook = None
        render_input = src_notebook

    # If the target file already exists and is newer than the source, skip it.
    if target_file.exists() and target_file.stat().st_mtime > src_notebook.stat().st_mtime:
        return fmt_skip(src_notebook, f"target file ({target_file.name}) is newer ☑️")

    if dry_run:
        return colored(f"Would process {src_notebook.name}", "green")

    try:
        if intermediate_notebook is not None:
            shutil.copy(src_notebook, intermediate_notebook)

            # Check if another file has to be copied too
            # Solely added for the purpose of agent_library_example.json
            for file in metadata.get("extra_files_to_copy", []):
                shutil.copy(src_notebook.parent / file, dest_dir / file)

        # Capture output so it can be included in the error report.
        result = subprocess.run(
            [quarto_bin, "render", render_input], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
        )
    finally:
        # Remove the intermediate copy on failure as well as on success, so a
        # failed render does not leave a stray notebook in the website tree.
        if intermediate_notebook is not None and intermediate_notebook.exists():
            intermediate_notebook.unlink()

    if result.returncode != 0:
        return fmt_error(
            src_notebook, f"Failed to render {src_notebook}\n\nstderr:\n{result.stderr}\nstdout:\n{result.stdout}"
        )

    post_process_mdx(target_file, src_notebook, front_matter)

    return fmt_ok(src_notebook)
230

231

232
# Notebook execution based on nbmake: https://github.com/treebeardtech/nbmake
233
@dataclass
class NotebookError:
    """Description of a failure observed while executing a notebook."""

    # Name of the error; the literal "timeout" marks a timed-out cell.
    error_name: str
    # Message attached to the error; may be empty or None.
    error_value: Optional[str]
    # Traceback text; may be empty.
    traceback: str
    # Source text of the cell the failure is attributed to.
    cell_source: str
239

240

241
@dataclass
class NotebookSkip:
    """Marker result for a notebook that was deliberately not executed."""

    # Human-readable explanation of why the notebook was skipped.
    reason: str
244

245

246
# Notebook format version passed to nbformat.read when loading notebooks.
NB_VERSION = 4
247

248

249
def test_notebook(notebook_path: Path, timeout: int = 300) -> Tuple[Path, Optional[Union[NotebookError, NotebookSkip]]]:
    """Execute a notebook and report how it went.

    Args:
        notebook_path: Notebook to execute.
        timeout: Timeout handed to the notebook client.

    Returns:
        A ``(notebook_path, outcome)`` pair. ``outcome`` is None on success, a
        NotebookSkip when the notebook metadata contains ``skip_test``, or a
        NotebookError describing the failing or timed-out cell.
    """
    nb = nbformat.read(str(notebook_path), NB_VERSION)

    if "skip_test" in nb.metadata:
        return notebook_path, NotebookSkip(reason=nb.metadata.skip_test)

    try:
        c = NotebookClient(
            nb,
            timeout=timeout,
            allow_errors=False,
            record_timing=True,
        )
        os.environ["PYDEVD_DISABLE_FILE_VALIDATION"] = "1"
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        # Run in a throwaway working directory.
        with tempfile.TemporaryDirectory() as tempdir:
            c.execute(cwd=tempdir)
    except CellExecutionError as exc:
        error = get_error_info(nb)
        if error is None:
            # Never report a failed run as a success. This replaces an `assert`,
            # which is removed under `python -O`; fall back to the exception itself.
            error = NotebookError(
                error_name=type(exc).__name__,
                error_value=str(exc),
                traceback="",
                cell_source="",
            )
        return notebook_path, error
    except CellTimeoutError as exc:
        error = get_timeout_info(nb)
        if error is None:
            # Same reasoning as above; keep the "timeout" name so callers that
            # special-case timeouts still recognise it.
            error = NotebookError(
                error_name="timeout",
                error_value=str(exc),
                traceback="",
                cell_source="",
            )
        return notebook_path, error

    return notebook_path, None
276

277

278
# Find the first code cell which did not complete.
279
def get_timeout_info(
    nb: NotebookNode,
) -> Optional[NotebookError]:
    """Return a "timeout" NotebookError for the first code cell that did not complete.

    A code cell counts as completed when its ``execution`` metadata contains a
    ``shell.execute_reply`` entry.

    Args:
        nb: The notebook after an interrupted execution.

    Returns:
        A NotebookError named ``"timeout"`` carrying the cell source, or None
        if every non-empty code cell completed.
    """
    for cell in nb.cells:
        if cell.cell_type != "code":
            continue

        cell_source = "".join(cell["source"])
        # NOTE(review): blank code cells are presumably never run by the
        # executor and so carry no timing metadata - confirm against nbclient.
        if not cell_source.strip():
            continue

        # Use .get so a cell without "execution" metadata is treated as not
        # completed instead of raising on attribute access.
        if "shell.execute_reply" not in cell.metadata.get("execution", {}):
            return NotebookError(
                error_name="timeout",
                error_value="",
                traceback="",
                cell_source=cell_source,
            )

    return None
294

295

296
def get_error_info(nb: NotebookNode) -> Optional[NotebookError]:
    """Describe the first error output found in the notebook's code cells.

    Cells are scanned in order. The first code cell with at least one output
    that is of type ``"error"`` or has an ``ename`` key wins, and the first
    such output of that cell is reported.

    Returns:
        A NotebookError built from that output, or None if no code cell has
        an error output.
    """
    code_cells = (candidate for candidate in nb["cells"] if candidate["cell_type"] == "code")

    for cell in code_cells:
        error_outputs = [out for out in cell["outputs"] if out["output_type"] == "error" or "ename" in out]
        if not error_outputs:
            continue

        first_error = error_outputs[0]
        traceback_text = "\n".join(first_error.get("traceback", ""))
        return NotebookError(
            error_name=first_error.get("ename", ""),
            error_value=first_error.get("evalue", ""),
            traceback=traceback_text,
            cell_source="".join(cell["source"]),
        )

    return None
311

312

313
# rendered_mdx is the final mdx file
314
def post_process_mdx(rendered_mdx: Path, source_notebooks: Path, front_matter: Dict) -> None:
    """Rewrite a rendered mdx file with merged front matter, a title and badges.

    The file is read, any leading ``---`` front matter block is stripped and
    merged over ``front_matter`` (values from the file win), links back to the
    source notebook are added to the front matter, a ``# title`` line is
    inserted when none is found before the first code fence, and "Open on
    GitHub" / "Open In Colab" badges are placed directly after the title
    line. The result is written back to ``rendered_mdx``.

    Args:
        rendered_mdx: The mdx file to rewrite in place.
        source_notebooks: Source notebook the mdx was rendered from; it must
            live under the parent of this script's directory.
        front_matter: Front matter supplied by the caller. It is not mutated.

    Raises:
        KeyError: If a title must be inserted but the merged front matter has
            no ``title`` entry.
        ValueError: If ``source_notebooks`` is not below the repository root.
    """
    with open(rendered_mdx, "r", encoding="utf-8") as f:
        content = f.read()

    # Work on a copy so the caller's dict is left untouched.
    front_matter = dict(front_matter)

    # If there is front matter in the mdx file, remove it from the content and
    # fold its values into ours instead of discarding the caller's entries.
    if content.startswith("---"):
        front_matter_end = content.find("---", 3)
        if front_matter_end != -1:
            rendered_front_matter = yaml.safe_load(content[4:front_matter_end])
            # safe_load yields None for an empty block; only merge mappings.
            if isinstance(rendered_front_matter, dict):
                front_matter.update(rendered_front_matter)
            content = content[front_matter_end + 3 :]

    # Each intermediate path needs to be resolved for this to work reliably
    repo_root = Path(__file__).parent.resolve().parent.resolve()
    repo_relative_notebook = source_notebooks.resolve().relative_to(repo_root)
    front_matter["source_notebook"] = f"/{repo_relative_notebook}"
    front_matter["custom_edit_url"] = f"https://github.com/microsoft/autogen/edit/main/{repo_relative_notebook}"

    # Is there a title on the content? Only search up until the first code cell
    first_code_cell = content.find("```")
    title_search_content = content if first_code_cell == -1 else content[:first_code_cell]

    # Locate the title line itself, so the badges are anchored to it rather
    # than to whichever "#" character happens to come first.
    if title_search_content.startswith("# "):
        title_start = 0
    else:
        title_marker = title_search_content.find("\n# ")
        if title_marker == -1:
            content = f"# {front_matter['title']}\n{content}"
            title_start = 0
        else:
            title_start = title_marker + 1

    # Find the end of the line with the title; a title on the last line
    # without a trailing newline ends at the end of the content.
    title_end = content.find("\n", title_start)
    if title_end == -1:
        title_end = len(content)

    github_link = f"https://github.com/microsoft/autogen/blob/main/{repo_relative_notebook}"
    content = (
        content[:title_end]
        + "\n[![Open on GitHub](https://img.shields.io/badge/Open%20on%20GitHub-grey?logo=github)]("
        + github_link
        + ")"
        + content[title_end:]
    )

    # If no colab link is present, insert one (it lands directly after the
    # title line, ahead of the GitHub badge).
    if "colab-badge.svg" not in content:
        colab_link = f"https://colab.research.google.com/github/microsoft/autogen/blob/main/{repo_relative_notebook}"
        content = (
            content[:title_end]
            + "\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]("
            + colab_link
            + ")"
            + content[title_end:]
        )

    # Dump front_matter to yaml
    front_matter_yaml = yaml.dump(front_matter, default_flow_style=False)

    # Rewrite the content as
    # ---
    # front_matter
    # ---
    # content
    new_content = f"---\n{front_matter_yaml}---\n{content}"
    with open(rendered_mdx, "w", encoding="utf-8") as f:
        f.write(new_content)
382

383

384
def path(path_str: str) -> Path:
    """Convert a path string into a ``Path``."""
    converted = Path(path_str)
    return converted
387

388

389
def collect_notebooks(notebook_directory: Path, website_directory: Path) -> typing.List[Path]:
    """List the notebooks to consider.

    Returns the ``*.ipynb`` files directly inside ``notebook_directory``,
    followed by every ``*.ipynb`` file found anywhere under
    ``website_directory``'s ``docs`` folder.
    """
    top_level_notebooks = notebook_directory.glob("*.ipynb")
    website_doc_notebooks = website_directory.glob("docs/**/*.ipynb")
    return [*top_level_notebooks, *website_doc_notebooks]
393

394

395
def fmt_skip(notebook: Path, reason: str) -> str:
    """Format a "[Skip]" status line for ``notebook`` with the given reason."""
    label = colored("[Skip]", "yellow")
    name = colored(notebook.name, "blue")
    return f"{label} {name}: {reason}"
397

398

399
def fmt_ok(notebook: Path) -> str:
    """Format an "[OK]" status line for ``notebook``."""
    label = colored("[OK]", "green")
    name = colored(notebook.name, "blue")
    return f"{label} {name} ✅"
401

402

403
def fmt_error(notebook: Path, error: Union[NotebookError, str]) -> str:
    """Format an "[Error]" status line for ``notebook``.

    Args:
        notebook: Notebook the error belongs to; only its name is shown.
        error: Either a ready-made message, or a NotebookError whose name and
            value are shown as ``name - value``.

    Raises:
        ValueError: If ``error`` is neither a string nor a NotebookError.
    """
    if isinstance(error, str):
        detail = error
    elif isinstance(error, NotebookError):
        detail = f"{error.error_name} - {error.error_value}"
    else:
        raise ValueError("error must be a string or a NotebookError")

    return f"{colored('[Error]', 'red')} {colored(notebook.name, 'blue')}: {detail}"
410

411

412
def start_thread_to_terminate_when_parent_process_dies(ppid: int):
    """Start a daemon thread that terminates this process once ``ppid`` is gone.

    Once per second the thread sends signal 0 to ``ppid``. When that raises
    OSError, SIGTERM is sent to the current process.

    Args:
        ppid: Process id to watch.
    """
    own_pid = os.getpid()

    def _watch_parent() -> None:
        while True:
            try:
                # Signal 0 is sent purely as a liveness probe.
                os.kill(ppid, 0)
            except OSError:
                os.kill(own_pid, signal.SIGTERM)
            time.sleep(1)

    threading.Thread(target=_watch_parent, daemon=True).start()
425

426

427
def main() -> None:
    """Command-line entry point with ``render`` and ``test`` subcommands.

    Notebooks are taken from the command line or, when none are given,
    collected from the notebook and website directories. Notebooks with a
    skip reason are reported and dropped. ``test`` executes the remaining
    notebooks in a process pool and exits with status 1 if any failed;
    ``render`` checks quarto and renders each notebook in turn.
    """
    script_dir = Path(__file__).parent.absolute()
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="subcommand")

    # Options shared by every subcommand.
    parser.add_argument(
        "--notebook-directory",
        type=path,
        help="Directory containing notebooks to process",
        default=script_dir / "../notebook",
    )
    parser.add_argument(
        "--website-directory", type=path, help="Root directory of docusarus website", default=script_dir
    )

    render_parser = subparsers.add_parser("render")
    render_parser.add_argument("--quarto-bin", help="Path to quarto binary", default="quarto")
    render_parser.add_argument("--dry-run", help="Don't render", action="store_true")
    render_parser.add_argument("notebooks", type=path, nargs="*", default=None)

    test_parser = subparsers.add_parser("test")
    test_parser.add_argument("--timeout", help="Timeout for each notebook", type=int, default=60)
    test_parser.add_argument("--exit-on-first-fail", "-e", help="Exit after first test fail", action="store_true")
    test_parser.add_argument("notebooks", type=path, nargs="*", default=None)
    test_parser.add_argument("--workers", help="Number of workers to use", type=int, default=-1)

    args = parser.parse_args()

    if args.subcommand is None:
        print("No subcommand specified")
        sys.exit(1)

    # Explicitly listed notebooks win over directory discovery.
    candidates = (
        args.notebooks if args.notebooks else collect_notebooks(args.notebook_directory, args.website_directory)
    )

    filtered_notebooks = []
    for candidate in candidates:
        skip_reason = skip_reason_or_none_if_ok(candidate)
        if skip_reason:
            print(fmt_skip(candidate, skip_reason))
            continue
        filtered_notebooks.append(candidate)

    if args.subcommand == "test":
        # -1 means "let the executor pick the worker count".
        if args.workers == -1:
            args.workers = None

        any_failed = False
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=args.workers,
            initializer=start_thread_to_terminate_when_parent_process_dies,
            initargs=(os.getpid(),),
        ) as executor:
            submitted = [executor.submit(test_notebook, nb, args.timeout) for nb in filtered_notebooks]
            for finished in concurrent.futures.as_completed(submitted):
                notebook, outcome = finished.result()

                if isinstance(outcome, NotebookSkip):
                    print(fmt_skip(notebook, outcome.reason))
                    continue

                if not isinstance(outcome, NotebookError):
                    print(fmt_ok(notebook))
                    continue

                # From here on the outcome is a NotebookError.
                if outcome.error_name == "timeout":
                    print(fmt_error(notebook, outcome.error_name))
                else:
                    separator = "-" * 80
                    print(separator)
                    print(fmt_error(notebook, outcome))
                    print(outcome.traceback)
                    print(separator)

                if args.exit_on_first_fail:
                    sys.exit(1)
                any_failed = True

        if any_failed:
            sys.exit(1)

    elif args.subcommand == "render":
        check_quarto_bin(args.quarto_bin)

        target_dir = notebooks_target_dir(args.website_directory)
        if not target_dir.exists():
            target_dir.mkdir(parents=True)

        for nb in filtered_notebooks:
            status_line = process_notebook(
                nb, args.website_directory, args.notebook_directory, args.quarto_bin, args.dry_run
            )
            print(status_line)
    else:
        print("Unknown subcommand")
        sys.exit(1)
520

521

522
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
524
autogen

Использование cookies