allennlp

py2md.py
532 строки · 18.3 Кб
Перенос по словам
1
#!/usr/bin/env python
2

3
"""
4
Turn docstrings from a single module into a markdown file.
5

6
We do this with PydocMarkdown, using custom processors and renderers defined here.
7
"""
8

9
import argparse
10
import logging
11
import os
12
import re
13
import sys
14
from collections import OrderedDict
15
from dataclasses import dataclass
16
from enum import Enum
17
from multiprocessing import Pool, cpu_count
18
from pathlib import Path
19
from typing import List, Optional, Tuple
20

21
import docspec
22
from docspec import ApiObject, Class, Data, Function, Indirection, Module
23
from docspec_python import format_arglist
24
from pydoc_markdown import Processor, PydocMarkdown, Resolver
25
from pydoc_markdown.contrib.loaders.python import PythonLoader
26
from pydoc_markdown.contrib.renderers.markdown import MarkdownRenderer
27

28
# Logging is configured at import time because this file is run as a command-line script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("py2md")

# Both settings may be overridden through environment variables of the same name.
# BASE_MODULE is the package prefix used to recognise in-project cross references;
# BASE_SOURCE_LINK is the URL prefix that "[SOURCE]" links are built from.
BASE_MODULE = os.getenv("BASE_MODULE", "allennlp")
BASE_SOURCE_LINK = os.getenv(
    "BASE_SOURCE_LINK",
    "https://github.com/allenai/allennlp/blob/main/allennlp/",
)
34

35

36
class DocstringError(Exception):
    """Raised when a docstring line does not follow the expected documentation format."""
38

39

40
def emphasize(s: str) -> str:
    """
    Wrap `s` in double underscores (Markdown strong emphasis).

    Underscores inside `s` are backslash-escaped first so they are not
    interpreted as Markdown markup themselves.
    """
    escaped = "\\_".join(s.split("_"))
    return "__" + escaped + "__"
44

45

46
class Section(Enum):
    """Docstring section headers; anything that is not listed explicitly maps to `OTHER`."""

    ARGUMENTS = "ARGUMENTS"
    PARAMETERS = "PARAMETERS"
    ATTRIBUTES = "ATTRIBUTES"
    MEMBERS = "MEMBERS"
    RETURNS = "RETURNS"
    RAISES = "RAISES"
    EXAMPLES = "EXAMPLES"
    OTHER = "OTHER"

    @classmethod
    def from_str(cls, section: str) -> "Section":
        """
        Look up a section by its header text, ignoring case.

        Returns `Section.OTHER` when the text does not name a known section.
        """
        try:
            # Every member's value is its upper-cased name, so a lookup by value is enough.
            return cls(section.upper())
        except ValueError:
            return cls.OTHER
63

64

65
# A required parameter description: "`type`", optionally followed by ", required".
REQUIRED_PARAM_RE = re.compile(r"^`([^`]+)`(, required\.?)?$")

# An optional parameter description that carries a default, e.g.
# "`type`, optional (default = `expr`)" or "`type` (optional, default = `expr`)".
# Group 1 is the type and group 4 is the default expression.
OPTIONAL_PARAM_RE = re.compile(
    r"^`([^`]+)`,?\s+"  # the type, in backticks
    r"(optional,?\s)?"  # "optional" written before the parenthesis
    r"\(\s?(optional,\s)?default\s?=\s?`([^`]+)`\s?\)"  # "(default = `expr`)"
    r"\.?$"  # optional trailing period
)

# An optional parameter description with no default: "`type`, optional".
OPTIONAL_PARAM_NO_DEFAULT_RE = re.compile(r"^`([^`]+)`,?\s+optional\.?$")
72

73

74
@dataclass
class Param:
    """One parameter (or attribute) entry parsed from a docstring section."""

    ident: str
    ty: Optional[str] = None
    required: bool = False
    default: Optional[str] = None

    @classmethod
    def from_line(cls, line: str) -> Optional["Param"]:
        """
        Parse a line of the form `ident : description` into a `Param`.

        Returns `None` when the line has no colon or when the text before the
        first colon contains a space, i.e. the line does not look like a
        parameter entry at all.

        Raises `DocstringError` when the line looks like an entry but its
        description matches none of the supported formats.
        """
        ident, colon, description = line.partition(":")
        if not colon:
            return None

        ident = ident.strip()
        if " " in ident:
            return None
        description = description.strip()

        required_match = REQUIRED_PARAM_RE.match(description)
        if required_match:
            return cls(ident=ident, ty=required_match.group(1), required=True)

        with_default_match = OPTIONAL_PARAM_RE.match(description)
        if with_default_match:
            return cls(
                ident=ident,
                ty=with_default_match.group(1),
                required=False,
                default=with_default_match.group(4),
            )

        no_default_match = OPTIONAL_PARAM_NO_DEFAULT_RE.match(description)
        if no_default_match:
            return cls(ident=ident, ty=no_default_match.group(1), required=False)

        # The brace placeholders in the last two strings are literal template text.
        raise DocstringError(
            f"Invalid parameter / attribute description: '{line}'\n"
            "Make sure types are enclosed in backticks.\n"
            "Required parameters should be documented like: '{ident} : `{type}`'\n"
            "Optional parameters should be documented like: '{ident} : `{type}`, optional (default = `{expr}`)'\n"
        )

    def to_line(self) -> str:
        """Render this entry as a Markdown bullet terminated by `<br>`."""
        pieces = [f"- {emphasize(self.ident)} :"]
        if self.ty:
            pieces.append(f" `{self.ty}`")
            if not self.required:
                pieces.append(", optional")
                if self.default:
                    pieces.append(f" (default = `{self.default}`)")
        pieces.append(" <br>")
        return "".join(pieces)
126

127

128
# For now we handle attributes / members in the same way as parameters / arguments.
129
Attrib = Param
130

131

132
@dataclass
class RetVal:
    """One entry of a Returns / Raises section: free text, or an identifier with a type."""

    description: Optional[str] = None
    ident: Optional[str] = None
    ty: Optional[str] = None

    @classmethod
    def from_line(cls, line: str) -> "RetVal":
        """
        Parse a line into a `RetVal`.

        A line without ": " is kept verbatim as a description. Otherwise the
        line is split at its first colon into an identifier and a type.

        Raises `DocstringError` when the type is non-empty and does not start
        with a backtick.
        """
        if ": " not in line:
            return cls(description=line)

        ident, _, ty = line.partition(":")
        ident, ty = ident.strip(), ty.strip()
        if ty and not ty.startswith("`"):
            raise DocstringError(f"Type should be enclosed in backticks: '{line}'")
        return cls(ident=ident, ty=ty)

    def to_line(self) -> str:
        """
        Render this entry as a Markdown bullet terminated by `<br>`.

        Raises `DocstringError` when neither a description nor an identifier is set.
        """
        if self.description:
            return f"- {self.description} <br>"
        if not self.ident:
            raise DocstringError("RetVal must have either description or ident")
        suffix = f" : {self.ty} <br>" if self.ty else " <br>"
        return f"- {emphasize(self.ident)}" + suffix
161

162

163
@dataclass
class ProcessorState:
    """Mutable state carried from line to line while a single docstring is processed."""

    # Created as an empty OrderedDict for every docstring.
    parameters: "OrderedDict[str, Param]"
    # Section named by the most recent header line; reset to None after two
    # consecutive blank lines.
    current_section: Optional[Section] = None
    # True while the current line lies inside a ``` fenced code block.
    codeblock_opened: bool = False
    # Number of blank lines seen in a row outside of code blocks.
    consecutive_blank_line_count: int = 0
169

170

171
@dataclass
class AllenNlpDocstringProcessor(Processor):
    """
    Use to turn our docstrings into Markdown.

    Each docstring is rewritten line by line: section headers become HTML
    `<h4>` anchors, entries of parameter / attribute / return sections become
    Markdown bullets, and sphinx-style cross references become Markdown links.
    Lines inside fenced code blocks, and the fence lines themselves, are left
    untouched.
    """

    CROSS_REF_RE = re.compile("(:(class|func|mod):`~?([a-zA-Z0-9_.]+)`)")
    UNDERSCORE_HEADER_RE = re.compile(r"(.*)\n-{3,}\n")
    MULTI_LINE_LINK_RE = re.compile(r"(\[[^\]]+\])\n\s*(\([^\)]+\))")

    def process(self, modules: List[Module], resolver: Optional[Resolver]) -> None:
        """Rewrite the docstring of every node in `modules` in place."""
        docspec.visit(modules, self.process_node)

    def process_node(self, node: docspec.ApiObject):
        """
        Replace `node.docstring` with its Markdown-preprocessed version.

        Nodes without a docstring are skipped. May raise `DocstringError`
        when a section entry is malformed.
        """
        if not getattr(node, "docstring", None):
            return

        lines: List[str] = []
        state: ProcessorState = ProcessorState(parameters=OrderedDict())

        docstring = node.docstring

        # Standardize header syntax to use '#' instead of underscores. The pattern
        # consumes the newline that follows the dashes, so it has to be put back;
        # otherwise the header is glued to the first line of the section body.
        docstring = self.UNDERSCORE_HEADER_RE.sub(r"# \g<1>\n", docstring)

        # It's common to break up markdown links into multiple lines in docstrings, but
        # they won't render as links in the doc HTML unless they are all on one line.
        docstring = self.MULTI_LINE_LINK_RE.sub(r"\g<1>\g<2>", docstring)

        for line in docstring.split("\n"):
            # Check if we're starting or ending a codeblock.
            if line.startswith("```"):
                state.codeblock_opened = not state.codeblock_opened
                if not state.codeblock_opened:
                    # A closing fence is a non-blank line outside of the code block.
                    state.consecutive_blank_line_count = 0
                # Fence lines are emitted verbatim. Running a closing fence through
                # the section handling would turn it into a bullet inside a
                # Returns / Raises section and leave the code block unterminated.
                lines.append(line)
                continue

            if not state.codeblock_opened:
                # If we're not in a codeblock, we'll do some pre-processing.
                if not line.strip():
                    state.consecutive_blank_line_count += 1
                    if state.consecutive_blank_line_count >= 2:
                        # Two blank lines in a row end the current section.
                        state.current_section = None
                else:
                    state.consecutive_blank_line_count = 0
                line = self._preprocess_line(node, line, state)

            lines.append(line)

        # Now set the docstring to our preprocessed version of it.
        node.docstring = "\n".join(lines)

    def _preprocess_line(self, node, line, state: ProcessorState) -> str:
        """
        Transform one docstring line that lies outside of any code block.

        A header line (`# Title`) updates `state.current_section` and is
        rendered as an `<h4>` element with a permalink. Any other line that is
        non-empty, unindented and not an `!!! ` admonition is parsed according
        to the current section; cross references are then rewritten.
        """
        match = re.match(r"#+ (.*)$", line)
        if match:
            name = match.group(1).strip()
            state.current_section = Section.from_str(name)
            slug = (node.name + "." + name).lower().replace(" ", "_")
            line = f'<h4 id="{slug}">{name}<a class="headerlink" href="#{slug}" title="Permanent link">&para;</a></h4>\n'  # noqa: E501
        else:
            if line and not line.startswith(" ") and not line.startswith("!!! "):
                if state.current_section in (
                    Section.ARGUMENTS,
                    Section.PARAMETERS,
                ):
                    param = Param.from_line(line)
                    if param:
                        line = param.to_line()
                elif state.current_section in (Section.ATTRIBUTES, Section.MEMBERS):
                    attrib = Attrib.from_line(line)
                    if attrib:
                        line = attrib.to_line()
                elif state.current_section in (Section.RETURNS, Section.RAISES):
                    retval = RetVal.from_line(line)
                    line = retval.to_line()

            line = self._transform_cross_references(line)

        return line

    def _transform_cross_references(self, line: str) -> str:
        """
        Replace sphinx style crossreferences with markdown links.

        Names under `BASE_MODULE` link to their `/api/...` page, names without
        a dot link to an anchor on the current page, and any other dotted name
        is rendered as inline code without a link.
        """
        for match, ty, name in self.CROSS_REF_RE.findall(line):
            if name.startswith(f"{BASE_MODULE}."):
                path = name.split(".")
                if ty == "mod":
                    href = "/api/" + "/".join(path[1:])
                else:
                    href = "/api/" + "/".join(path[1:-1]) + "/#" + path[-1].lower()
                cross_ref = f"[`{path[-1]}`]({href})"
            elif "." not in name:
                cross_ref = f"[`{name}`](#{name.lower()})"
            else:
                cross_ref = f"`{name}`"
            line = line.replace(match, cross_ref)
        return line
266

267

268
@dataclass
class AllenNlpFilterProcessor(Processor):
    """
    Used to filter out nodes that we don't want to document.
    """

    # Underscore-prefixed names that are documented anyway. Entries are either a
    # bare member name or a "Parent.member" pair.
    PRIVATE_METHODS_TO_KEEP = {
        "DatasetReader._read",
        "__init__",
        "__call__",
        "__iter__",
        "InfluenceInterpreter._calculate_influence_scores",
        "TransformerModule._from_config",
        "TransformerModule._pretrained_mapping",
        "TransformerModule._pretrained_relevant_module",
        "TransformerModule._pretrained_ignore",
        "TransformerModule._pretrained_allow_missing",
        "TransformerModule._distributed_loading_strategy",
        "Constraint._update_state",
        "Module._post_load_state_dict",
    }

    def process(self, modules: List[Module], resolver: Optional[Resolver]) -> None:
        """Drop every node of `modules` for which `_check` returns `False`."""
        docspec.filter_visit(modules, self._check)

    def _check(self, node: ApiObject) -> bool:
        """Return `True` when `node` should be kept in the documentation."""
        parent = node.parent
        keep = self.PRIVATE_METHODS_TO_KEEP

        if node.name.startswith("_"):
            # Private names survive only when explicitly listed, either bare or
            # qualified with the parent's name.
            qualified = f"{parent.name}.{node.name}" if parent else None
            return node.name in keep or qualified in keep

        # Public members of a private parent are hidden as well.
        if parent and parent.name.startswith("_"):
            return False

        # Module-level `logger` objects are not documented.
        return not (node.name == "logger" and isinstance(parent, Module))
305

306

307
class AllenNlpRenderer(MarkdownRenderer):
    """
    Markdown renderer that customizes signatures, module breadcrumbs and object output.
    """

    @staticmethod
    def _class_header_signature(klass: Class) -> str:
        """
        Build the `class Name(Base, ..., metaclass=Meta)` header for `klass`.

        Bases and metaclass are joined as one list, so a class that has a
        metaclass but no bases renders as `class Name(metaclass=Meta)` rather
        than with a stray leading comma.
        """
        header_args = [str(base) for base in (klass.bases or ())]
        if klass.metaclass:
            header_args.append("metaclass=" + str(klass.metaclass))
        if header_args:
            return f"class {klass.name}({', '.join(header_args)})"
        return f"class {klass.name}"

    def _format_function_signature(
        self,
        func: Function,
        override_name: Optional[str] = None,
        add_method_bar: bool = True,
        include_parent_class: bool = True,
    ) -> str:
        """
        Format the signature of `func` as source-like text.

        # Parameters

        func : `Function`
            The function to render.
        override_name : `Optional[str]`, optional (default = `None`)
            Name to show instead of `func.name`.
        add_method_bar : `bool`, optional (default = `True`)
            If `func` belongs to a class, prefix every signature line with " | ".
        include_parent_class : `bool`, optional (default = `True`)
            If the method bar is added, also prepend the parent class header.
        """
        parts = []
        # Guarded the same way as in `_format_classdef_signature`.
        for dec in func.decorations or []:
            parts.append("@{}{}\n".format(dec.name, dec.args or ""))
        if func.modifiers and "async" in func.modifiers:
            parts.append("async ")
        if self.signature_with_def:
            parts.append("def ")
        if self.signature_class_prefix and (
            func.is_function() and func.parent and func.parent.is_class()
        ):
            parts.append(func.parent.name + ".")
        parts.append((override_name or func.name))
        signature_args = format_arglist(func.args)
        if signature_args.endswith(","):
            signature_args = signature_args[:-1].strip()

        # Signatures that would get too long are broken up, one argument per line.
        if (
            len(parts[-1])
            + len(signature_args)
            + (0 if not func.return_type else len(str(func.return_type)))
            > 60
        ):
            signature_args = ",\n    ".join(
                filter(
                    lambda s: s.strip() not in ("", ","),
                    (format_arglist([arg]) for arg in func.args),
                )
            )
            parts.append("(\n    " + signature_args + "\n)")
        else:
            parts.append("(" + signature_args + ")")

        if func.return_type:
            parts.append(" -> {}".format(func.return_type))
        result = "".join(parts)
        if add_method_bar and isinstance(func.parent, Class):
            result = "\n".join(" | " + line for line in result.split("\n"))
            if include_parent_class:
                class_signature = self._class_header_signature(func.parent)
                result = f"{class_signature}:\n | ...\n{result}"
        return result

    def _format_data_signature(self, data: Data) -> str:
        """
        Format a data member as `name[: type] = value`.

        Values longer than `self.data_expression_maxlength` are truncated, and
        class attributes are shown beneath their class header.
        """
        expr = data.value
        if expr and len(expr) > self.data_expression_maxlength:
            expr = expr[: self.data_expression_maxlength] + " ..."

        if data.datatype:
            signature = f"{data.name}: {data.datatype} = {expr}"
        else:
            signature = f"{data.name} = {expr}"

        if data.parent and isinstance(data.parent, Class):
            class_signature = self._class_header_signature(data.parent)
            return f"{class_signature}:\n | ...\n | {signature}"
        return signature

    def _format_classdef_signature(self, cls: Class) -> str:
        """
        Format the class definition line, including decorators.

        When `classdef_render_init_signature_if_needed` is set and the class
        has an `__init__` member, its signature is appended below the header.
        """
        code = ""
        if cls.decorations:
            for dec in cls.decorations:
                code += "@{}{}\n".format(dec.name, dec.args or "")
        code += self._class_header_signature(cls)
        if self.signature_python_help_style:
            code = cls.path() + " = " + code
        members = {m.name: m for m in cls.members}
        if self.classdef_render_init_signature_if_needed and ("__init__" in members):
            code += ":\n" + self._format_function_signature(
                members["__init__"],
                add_method_bar=True,
                include_parent_class=False,
            )
        return code

    def _render_module_breadcrumbs(self, fp, mod: Module):
        """Write the breadcrumb trail and the "[SOURCE]" link for `mod` to `fp`."""
        submods = mod.name.split(".")
        breadcrumbs = []
        for i, submod_name in enumerate(submods):
            if i == 0:
                title = f"<i>{submod_name}</i>"
            elif i == len(submods) - 1:
                title = f"<strong>.{submod_name}</strong>"
            else:
                title = f"<i>.{submod_name}</i>"
            breadcrumbs.append(title)
        # The first component is dropped because BASE_SOURCE_LINK's default already
        # ends in the top-level package directory.
        # NOTE(review): a "<module path>.py" file is assumed here, so a package
        # documented from its __init__.py would presumably get a wrong link - confirm.
        source_link = BASE_SOURCE_LINK + "/".join(submods[1:]) + ".py"
        fp.write(
            "<div>\n"
            ' <p class="alignleft">' + "".join(breadcrumbs) + "</p>\n"
            f' <p class="alignright"><a class="sourcelink" href="{source_link}">[SOURCE]</a></p>\n'
            "</div>\n"
            '<div style="clear: both;"></div>\n\n---\n\n'
        )

    def _render_object(self, fp, level, obj):
        """
        Write the header, signature block and docstring of `obj` to `fp`.

        `Indirection` objects and `__init__` functions are not rendered on their own.
        """
        if isinstance(obj, Indirection) or (
            isinstance(obj, Function) and obj.name == "__init__"
        ):
            return
        if not isinstance(obj, Module) or self.render_module_header:
            self._render_header(fp, level, obj)
        if isinstance(obj, Module):
            self._render_module_breadcrumbs(fp, obj)
        self._render_signature_block(fp, obj)
        if obj.docstring:
            lines = obj.docstring.split("\n")
            if self.docstrings_as_blockquote:
                lines = ["> " + x for x in lines]
            fp.write("\n".join(lines))
            fp.write("\n\n")
443

444

445
def py2md(module: str, out: Optional[str] = None) -> bool:
    """
    Render the docstrings of `module` as Markdown.

    The result is written to the file `out`; its parent directory is created
    when missing. `out` is handed to the renderer as `filename`.

    Returns `True` if module successfully processed, otherwise `False`
    (a `DocstringError` was raised and logged while processing).
    """
    logger.debug("Processing %s", module)

    loaders = [PythonLoader(modules=[module])]
    processors = [AllenNlpFilterProcessor(), AllenNlpDocstringProcessor()]
    renderer = AllenNlpRenderer(
        filename=out,
        add_method_class_prefix=False,
        add_member_class_prefix=False,
        data_code_block=True,
        signature_with_def=True,
        signature_with_vertical_bar=True,
        use_fixed_header_levels=False,
        render_module_header=False,
        descriptive_class_title=False,
        classdef_with_decorators=True,
        classdef_render_init_signature_if_needed=True,
    )
    pydocmd = PydocMarkdown(loaders=loaders, processors=processors, renderer=renderer)

    if out:
        os.makedirs(Path(out).parent, exist_ok=True)

    modules = pydocmd.load_modules()
    try:
        pydocmd.process(modules)
    except DocstringError as err:
        logger.exception("Failed to process %s.\n%s", module, err)
        return False
    else:
        pydocmd.render(modules)
        return True
479

480

481
def _py2md_wrapper(x: Tuple[str, str]) -> bool:
    """
    Call `py2md` with a `(module, out)` pair.

    Used to wrap py2md since we can't pickle a lambda (needed for multiprocessing).
    """
    module, out = x[0], x[1]
    return py2md(module, out)
486

487

488
def parse_args():
    """Parse the command line: one or more module names plus optional `-o/--out` files."""
    modules_help = """The Python modules to parse."""
    out_help = """Output files.
                If given, must have the same number of items as 'modules'.
                If not given, stdout is used."""

    cli = argparse.ArgumentParser()
    cli.add_argument("modules", nargs="+", type=str, help=modules_help)
    cli.add_argument("-o", "--out", nargs="+", type=str, help=out_help)
    return cli.parse_args()
501

502

503
def main():
    """
    Command-line entry point: convert every requested module to Markdown.

    Exits with status 1 if any module failed. Raises `ValueError` when the
    number of output files differs from the number of modules.
    """
    opts = parse_args()
    modules = opts.modules
    outputs = opts.out or [None] * len(modules)
    if len(outputs) != len(modules):
        raise ValueError("Number inputs and outputs should be the same.")

    n_threads = cpu_count()
    jobs = zip(modules, outputs)
    if len(modules) > n_threads and opts.out:
        # If writing to files, can process in parallel.
        chunk_size = max(1, len(outputs) // n_threads)
        logger.info("Using %d threads", n_threads)
        with Pool(n_threads) as pool:
            results = list(pool.imap(_py2md_wrapper, jobs, chunk_size))
    else:
        # If writing to stdout, need to process sequentially. Otherwise the output
        # could get intertwined.
        results = [py2md(module, out) for module, out in jobs]

    errors: int = sum(1 for ok in results if not ok)
    logger.info("Processed %d modules", len(modules))
    if errors:
        logger.error("Found %d errors", errors)
        sys.exit(1)
529

530

531
if __name__ == "__main__":
532
    main()
533
allennlp

Использование cookies