4
Turn docstrings from a single module into a markdown file.
6
We do this with PydocMarkdown, using custom processors and renderers defined here.
14
from collections import OrderedDict
15
from dataclasses import dataclass
17
from multiprocessing import Pool, cpu_count
18
from pathlib import Path
19
from typing import List, Optional, Tuple
22
from docspec import ApiObject, Class, Data, Function, Indirection, Module
23
from docspec_python import format_arglist
24
from pydoc_markdown import Processor, PydocMarkdown, Resolver
25
from pydoc_markdown.contrib.loaders.python import PythonLoader
26
from pydoc_markdown.contrib.renderers.markdown import MarkdownRenderer
28
# Configure logging at INFO level and create this script's logger, named "py2md".
logging.basicConfig(level=logging.INFO)
29
logger = logging.getLogger("py2md")
30
# Name of the base package. Read from the BASE_MODULE environment variable,
# falling back to "allennlp". Cross references whose dotted name starts with
# this package are turned into "/api/..." links.
BASE_MODULE = os.environ.get("BASE_MODULE", "allennlp")
31
# URL prefix for "[SOURCE]" links. Read from the BASE_SOURCE_LINK environment
# variable; a module's relative path and ".py" are appended to it.
BASE_SOURCE_LINK = os.environ.get(
32
"BASE_SOURCE_LINK", "https://github.com/allenai/allennlp/blob/main/allennlp/"
36
# Raised when a docstring line does not follow the format this script expects
# (for example a parameter or return type that is not enclosed in backticks).
class DocstringError(Exception):
40
# Prepare an identifier for Markdown output; the `to_line` methods call this on
# parameter and return-value names.
def emphasize(s: str) -> str:
42
# Backslash-escape every underscore in the input.
s = s.replace("_", "\\_")
47
# Docstring section names. Values are upper case because `from_str` upper-cases
# its input before comparing against them.
ARGUMENTS = "ARGUMENTS"
48
PARAMETERS = "PARAMETERS"
49
ATTRIBUTES = "ATTRIBUTES"
57
# Look up a section by its header text, ignoring case.
def from_str(cls, section: str) -> "Section":
58
# Member values are upper case, so normalize the input before comparing.
section = section.upper()
60
if section == member.value:
65
# Description of a required parameter: a backtick-quoted type, optionally
# followed by ", required" and a trailing period. Group 1 is the type.
REQUIRED_PARAM_RE = re.compile(r"^`([^`]+)`(, required\.?)?$")
67
# Description of an optional parameter with a default, for example
# "`int`, optional (default = `3`)". The word "optional" may appear before the
# parentheses or inside them. Group 1 is the type, group 4 the default expression.
OPTIONAL_PARAM_RE = re.compile(
68
r"^`([^`]+)`,?\s+(optional,?\s)?\(\s?(optional,\s)?default\s?=\s?`([^`]+)`\s?\)\.?$"
71
# Description of an optional parameter without a default: "`type`, optional"
# (comma and trailing period are optional). Group 1 is the type.
OPTIONAL_PARAM_NO_DEFAULT_RE = re.compile(r"^`([^`]+)`,?\s+optional\.?$")
77
# Type text captured from between the backticks of the description.
ty: Optional[str] = None
78
# Set to True by `from_line` when the description matches REQUIRED_PARAM_RE.
required: bool = False
79
# Default-value expression (group 4 of OPTIONAL_PARAM_RE), when one is given.
default: Optional[str] = None
82
# Parse one "ident : description" line of a parameter section into a `Param`.
# The description is tried against the required, optional-with-default and
# optional-without-default patterns, in that order. A `DocstringError` is
# raised when none of them matches.
def from_line(cls, line: str) -> Optional["Param"]:
86
# Split on the first colon only, so the description may itself contain colons.
ident, description = line.split(":", 1)
88
description = description.strip()
93
# Case 1: required parameter; only the type is captured.
maybe_match = REQUIRED_PARAM_RE.match(description)
95
ty = maybe_match.group(1)
96
return cls(ident=ident, ty=ty, required=True)
98
# Case 2: optional parameter with an explicit default (group 4).
maybe_match = OPTIONAL_PARAM_RE.match(description)
100
ty = maybe_match.group(1)
101
default = maybe_match.group(4)
102
return cls(ident=ident, ty=ty, required=False, default=default)
104
# Case 3: optional parameter with no default given.
maybe_match = OPTIONAL_PARAM_NO_DEFAULT_RE.match(description)
106
ty = maybe_match.group(1)
107
return cls(ident=ident, ty=ty, required=False)
109
# No pattern matched. Only the first string below is an f-string, so the
# offending line is interpolated while the "{ident}" / "{type}" / "{expr}"
# placeholders in the following plain strings are printed literally.
raise DocstringError(
110
f"Invalid parameter / attribute description: '{line}'\n"
111
"Make sure types are enclosed in backticks.\n"
112
"Required parameters should be documented like: '{ident} : `{type}`'\n"
113
"Optional parameters should be documented like: '{ident} : `{type}`, optional (default = `{expr}`)'\n"
116
# Render this parameter as a Markdown list item: "- <name> : `<type>`", with
# extra text appended for parameters that are not required.
def to_line(self) -> str:
117
line: str = f"- {emphasize(self.ident)} :"
119
# The type is rendered as inline code.
line += f" `{self.ty}`"
120
if not self.required:
123
line += f" (default = `{self.default}`)"
134
# Free-form text; set when `from_line` keeps the whole line as a description.
description: Optional[str] = None
135
# Name part of an "ident : type" line.
ident: Optional[str] = None
136
# Type part of an "ident : type" line. `from_line` rejects a non-empty type
# that does not start with a backtick.
ty: Optional[str] = None
139
# Parse one line of a returns / raises section into a `RetVal`. The line is
# either kept whole as a description or split on its first colon into an
# identifier and a type.
def from_line(cls, line: str) -> "RetVal":
141
return cls(description=line)
142
ident, ty = line.split(":", 1)
143
ident = ident.strip()
145
# A non-empty type must open with a backtick.
if ty and not ty.startswith("`"):
146
raise DocstringError(f"Type should be enclosed in backticks: '{line}'")
147
return cls(ident=ident, ty=ty)
149
# Render this entry as a Markdown list item ending in an HTML line break,
# built either from the description or from the identifier and its type.
# Raises `DocstringError` when the entry has neither.
def to_line(self) -> str:
151
line = f"- {self.description} <br>"
153
line = f"- {emphasize(self.ident)}"
155
line += f" : {self.ty} <br>"
159
raise DocstringError("RetVal must have either description or ident")
165
# Ordered mapping from a name to its `Param`; created empty for every docstring.
parameters: "OrderedDict[str, Param]"
166
# Section of the most recently seen header. `process_node` resets it to None
# once the blank-line counter reaches two.
current_section: Optional[Section] = None
167
# Flipped by every line that starts with ``` (a code fence).
codeblock_opened: bool = False
168
# Running counter used to detect two blank lines in a row.
consecutive_blank_line_count: int = 0
172
class AllenNlpDocstringProcessor(Processor):
174
Used to turn our docstrings into Markdown.
177
# Sphinx-style cross reference such as :class:`~pkg.mod.Name`. Groups: the
# whole reference, the role (class, func or mod), and the dotted name without
# the optional leading "~".
CROSS_REF_RE = re.compile("(:(class|func|mod):`~?([a-zA-Z0-9_.]+)`)")
178
# A line of text followed by a line of three or more dashes (an underlined
# header). Group 1 is the header text.
UNDERSCORE_HEADER_RE = re.compile(r"(.*)\n-{3,}\n")
179
# A Markdown link whose "[text]" and "(target)" halves sit on separate lines.
MULTI_LINE_LINK_RE = re.compile(r"(\[[^\]]+\])\n\s*(\([^\)]+\))")
181
# Processor entry point: visit every API object in `modules` and hand it to
# `process_node`. `resolver` is accepted but not used in the visible body.
def process(self, modules: List[Module], resolver: Optional[Resolver]) -> None:
182
docspec.visit(modules, self.process_node)
184
# Rewrite `node.docstring` in place: normalize headers and links on the whole
# text, then transform it line by line and join the result back together.
def process_node(self, node: docspec.ApiObject):
185
# Covers both a missing `docstring` attribute and an empty docstring.
if not getattr(node, "docstring", None):
188
lines: List[str] = []
189
# Fresh parsing state for each docstring.
state: ProcessorState = ProcessorState(parameters=OrderedDict())
191
docstring = node.docstring
194
# Turn underlined headers ("Title" over "---") into "# Title".
docstring = self.UNDERSCORE_HEADER_RE.sub(r"# \g<1>", docstring)
198
# Pull "[text]" and "(target)" of a link that was split across lines back together.
docstring = self.MULTI_LINE_LINK_RE.sub(r"\g<1>\g<2>", docstring)
200
for line in docstring.split("\n"):
202
# Every line starting with a code fence toggles the "inside code block" flag.
if line.startswith("```"):
203
state.codeblock_opened = not state.codeblock_opened
205
if not state.codeblock_opened:
208
state.consecutive_blank_line_count += 1
209
# Two blank lines in a row end the current section.
if state.consecutive_blank_line_count >= 2:
210
state.current_section = None
212
state.consecutive_blank_line_count = 0
213
line = self._preprocess_line(node, line, state)
218
node.docstring = "\n".join(lines)
220
# Transform a single docstring line: Markdown headers become HTML <h4> elements
# with a permalink anchor, entries of recognized sections are re-rendered via
# Param / Attrib / RetVal, and Sphinx cross references become Markdown links.
def _preprocess_line(self, node, line, state: ProcessorState) -> str:
221
# A Markdown header: one or more "#", a space, then the title (group 1).
match = re.match(r"#+ (.*)$", line)
223
# NOTE(review): `match.group(1).strip()` is evaluated three times below;
# the `name` local could be computed first and reused.
state.current_section = Section.from_str(match.group(1).strip())
224
name = match.group(1).strip()
225
# Anchor id: "<node name>.<title>", lower-cased, spaces replaced by underscores.
slug = (node.name + "." + match.group(1).strip()).lower().replace(" ", "_")
226
line = f'<h4 id="{slug}">{name}<a class="headerlink" href="#{slug}" title="Permanent link">¶</a></h4>\n'
228
# Only non-empty lines that do not start with a space or with "!!! " are
# treated as section entries.
if line and not line.startswith(" ") and not line.startswith("!!! "):
229
if state.current_section in (
233
param = Param.from_line(line)
235
line = param.to_line()
236
elif state.current_section in (Section.ATTRIBUTES, Section.MEMBERS):
237
attrib = Attrib.from_line(line)
239
line = attrib.to_line()
240
elif state.current_section in (Section.RETURNS, Section.RAISES):
241
retval = RetVal.from_line(line)
242
line = retval.to_line()
244
line = self._transform_cross_references(line)
248
def _transform_cross_references(self, line: str) -> str:
250
Replace sphinx style crossreferences with markdown links.
252
# `findall` yields one (whole reference, role, dotted name) tuple per match.
for match, ty, name in self.CROSS_REF_RE.findall(line):
253
# Names inside the base package link to the generated API pages.
if name.startswith(f"{BASE_MODULE}."):
254
path = name.split(".")
256
# Link to a page: the leading package component is dropped from the path.
href = "/api/" + "/".join(path[1:])
258
# Link to an anchor on the containing page; the anchor is the lower-cased
# last component of the dotted name.
href = "/api/" + "/".join(path[1:-1]) + "/#" + path[-1].lower()
259
cross_ref = f"[`{path[-1]}`]({href})"
260
# An undotted name becomes a link to an anchor on the current page.
elif "." not in name:
261
cross_ref = f"[`{name}`](#{name.lower()})"
263
# Fallback: inline code with no link.
cross_ref = f"`{name}`"
264
line = line.replace(match, cross_ref)
269
class AllenNlpFilterProcessor(Processor):
271
Used to filter out nodes that we don't want to document.
274
# Allow-list of underscore-prefixed members. `_check` looks a node up here by
# its bare name and as "<parent name>.<node name>"; the entries below use the
# second form.
PRIVATE_METHODS_TO_KEEP = {
275
"DatasetReader._read",
279
"InfluenceInterpreter._calculate_influence_scores",
280
"TransformerModule._from_config",
281
"TransformerModule._pretrained_mapping",
282
"TransformerModule._pretrained_relevant_module",
283
"TransformerModule._pretrained_ignore",
284
"TransformerModule._pretrained_allow_missing",
285
"TransformerModule._distributed_loading_strategy",
286
"Constraint._update_state",
287
"Module._post_load_state_dict",
290
# Processor entry point: run `docspec.filter_visit` over `modules` with
# `_check` as the predicate. `resolver` is accepted but not used in the visible body.
def process(self, modules: List[Module], resolver: Optional[Resolver]) -> None:
291
docspec.filter_visit(modules, self._check)
293
# Predicate handed to `docspec.filter_visit`; it decides from a node's name and
# parent. NOTE(review): presumably True means "keep this node" — confirm
# against the docspec documentation.
def _check(self, node: ApiObject) -> bool:
294
# Underscore-prefixed names are looked up in PRIVATE_METHODS_TO_KEEP, first
# by bare name and then as "<parent name>.<node name>".
if node.name.startswith("_"):
295
if node.name in self.PRIVATE_METHODS_TO_KEEP:
297
if node.parent and f"{node.parent.name}.{node.name}" in self.PRIVATE_METHODS_TO_KEEP:
300
# Members whose parent is itself underscore-prefixed.
if node.parent and node.parent.name.startswith("_"):
302
# Module-level objects named "logger".
if node.name == "logger" and isinstance(node.parent, Module):
307
class AllenNlpRenderer(MarkdownRenderer):
308
# Build the signature text shown for a function or method: decorators, an
# optional "async " prefix, the name, the argument list (in one of two layouts)
# and the return annotation. Methods can additionally be prefixed with " | "
# and wrapped in a stub of their class header.
def _format_function_signature(
311
# NOTE(review): the default is None, so `Optional[str]` would describe this
# parameter more accurately than `str`.
override_name: str = None,
312
add_method_bar: bool = True,
313
include_parent_class: bool = True,
316
# Decorators come first, one per line, with their arguments if any.
for dec in func.decorations:
317
parts.append("@{}{}\n".format(dec.name, dec.args or ""))
318
if func.modifiers and "async" in func.modifiers:
319
parts.append("async ")
320
if self.signature_with_def:
322
if self.signature_class_prefix and (
323
func.is_function() and func.parent and func.parent.is_class()
325
parts.append(func.parent.name + ".")
326
# `override_name`, when given, replaces the function's own name.
parts.append((override_name or func.name))
327
signature_args = format_arglist(func.args)
328
# Drop a trailing comma from the formatted argument list.
if signature_args.endswith(","):
329
signature_args = signature_args[:-1].strip()
333
+ len(signature_args)
334
+ (0 if not func.return_type else len(str(func.return_type)))
337
# Alternative layout: format each argument on its own and join them with
# ",\n", skipping entries that are blank or a lone comma.
signature_args = ",\n ".join(
339
lambda s: s.strip() not in ("", ","),
340
(format_arglist([arg]) for arg in func.args),
343
parts.append("(\n " + signature_args + "\n)")
345
parts.append("(" + signature_args + ")")
348
parts.append(" -> {}".format(func.return_type))
349
result = "".join(parts)
350
# For methods, prefix every line of the signature with " | ".
if add_method_bar and isinstance(func.parent, Class):
351
result = "\n".join(" | " + line for line in result.split("\n"))
352
# Optionally put a "class Name(bases):" header and a " | ..." placeholder
# line in front of the method signature.
if include_parent_class:
353
bases = ", ".join(map(str, func.parent.bases))
354
if func.parent.metaclass:
355
bases += ", metaclass=" + str(func.parent.metaclass)
357
class_signature = f"class {func.parent.name}({bases})"
359
class_signature = f"class {func.parent.name}"
360
result = f"{class_signature}:\n | ...\n{result}"
363
# Build the signature text shown for a data object (variable or attribute):
# "name: type = expr" or "name = expr". An attribute whose parent is a class is
# shown underneath a stub of that class's header.
def _format_data_signature(self, data: Data) -> str:
365
# Cut over-long value expressions at `data_expression_maxlength` and mark
# the cut with " ...".
if expr and len(expr) > self.data_expression_maxlength:
366
expr = expr[: self.data_expression_maxlength] + " ..."
369
signature = f"{data.name}: {data.datatype} = {expr}"
371
signature = f"{data.name} = {expr}"
373
# NOTE(review): the class-header construction below repeats the one in
# `_format_function_signature`; a shared private helper would remove the
# duplication.
if data.parent and isinstance(data.parent, Class):
374
bases = ", ".join(map(str, data.parent.bases))
375
if data.parent.metaclass:
376
bases += ", metaclass=" + str(data.parent.metaclass)
378
class_signature = f"class {data.parent.name}({bases})"
380
class_signature = f"class {data.parent.name}"
381
return f"{class_signature}:\n | ...\n | {signature}"
385
# Build the signature text shown for a class: decorators, "class Name(bases)"
# (with the metaclass appended to the bases when there is one) and, when
# enabled and the class defines one, the signature of its `__init__`.
def _format_classdef_signature(self, cls: Class) -> str:
388
for dec in cls.decorations:
389
code += "@{}{}\n".format(dec.name, dec.args or "")
390
bases = ", ".join(map(str, cls.bases))
392
bases += ", metaclass=" + str(cls.metaclass)
394
code += "class {}({})".format(cls.name, bases)
396
code += "class {}".format(cls.name)
397
# Help-style output prefixes the definition with "<path> = ".
if self.signature_python_help_style:
398
code = cls.path() + " = " + code
399
# Index members by name so `__init__` can be looked up directly.
members = {m.name: m for m in cls.members}
400
if self.classdef_render_init_signature_if_needed and ("__init__" in members):
401
# The class header is already in `code`, so the nested call must not
# add it a second time (include_parent_class=False).
code += ":\n" + self._format_function_signature(
404
include_parent_class=False,
408
# Write an HTML breadcrumb line for a module: each component of its dotted name
# on the left and a "[SOURCE]" link on the right, followed by a horizontal rule.
def _render_module_breadcrumbs(self, fp, mod: Module):
409
submods = mod.name.split(".")
411
for i, submod_name in enumerate(submods):
413
title = f"<i>{submod_name}</i>"
414
# The last component is shown in bold.
elif i == len(submods) - 1:
415
title = f"<strong>.{submod_name}</strong>"
417
title = f"<i>.{submod_name}</i>"
418
breadcrumbs.append(title)
419
# NOTE(review): the next statement builds a string and discards it; the same
# expression is evaluated again when `source_link` is assigned, so this line
# can be deleted.
"/".join(submods[1:])
420
# The first component of the module name is dropped because
# BASE_SOURCE_LINK's default value already ends in the package directory.
source_link = BASE_SOURCE_LINK + "/".join(submods[1:]) + ".py"
423
' <p class="alignleft">' + "".join(breadcrumbs) + "</p>\n"
424
f' <p class="alignright"><a class="sourcelink" href="{source_link}">[SOURCE]</a></p>\n'
426
'<div style="clear: both;"></div>\n\n---\n\n'
429
# Render one API object to `fp`: header, module breadcrumbs (modules only),
# signature block and docstring.
def _render_object(self, fp, level, obj):
430
# `and` binds tighter than `or`, so this matches every Indirection, plus
# Function objects named "__init__".
if isinstance(obj, Indirection) or isinstance(obj, Function) and obj.name == "__init__":
432
# Non-module objects always get a header; modules only when
# `render_module_header` is set.
if not isinstance(obj, Module) or self.render_module_header:
433
self._render_header(fp, level, obj)
434
if isinstance(obj, Module):
435
self._render_module_breadcrumbs(fp, obj)
436
self._render_signature_block(fp, obj)
438
lines = obj.docstring.split("\n")
439
# Blockquote mode prefixes every docstring line with "> ".
if self.docstrings_as_blockquote:
440
lines = ["> " + x for x in lines]
441
fp.write("\n".join(lines))
445
def py2md(module: str, out: Optional[str] = None) -> bool:
447
Returns `True` if module successfully processed, otherwise `False`.
449
logger.debug("Processing %s", module)
450
# One pipeline per module: load it with PythonLoader, run the two custom
# processors (the filter is listed before the docstring processor), and
# render with the custom Markdown renderer.
pydocmd = PydocMarkdown(
451
loaders=[PythonLoader(modules=[module])],
452
processors=[AllenNlpFilterProcessor(), AllenNlpDocstringProcessor()],
453
renderer=AllenNlpRenderer(
455
add_method_class_prefix=False,
456
add_member_class_prefix=False,
457
data_code_block=True,
458
signature_with_def=True,
459
signature_with_vertical_bar=True,
460
use_fixed_header_levels=False,
461
render_module_header=False,
462
descriptive_class_title=False,
463
classdef_with_decorators=True,
464
classdef_render_init_signature_if_needed=True,
469
# Create the output file's parent directory if it does not exist yet.
os.makedirs(out_path.parent, exist_ok=True)
471
modules = pydocmd.load_modules()
473
pydocmd.process(modules)
474
# A malformed docstring is logged together with its traceback.
except DocstringError as err:
475
logger.exception("Failed to process %s.\n%s", module, err)
477
pydocmd.render(modules)
481
# Adapter for `Pool.imap`, which passes a single argument per task: `x` is a
# (module, out) pair that is unpacked into `py2md`'s two positional parameters.
def _py2md_wrapper(x: Tuple[str, str]) -> bool:
483
Used to wrap py2md since we can't pickle a lambda (needed for multiprocessing).
485
return py2md(x[0], x[1])
489
# Command-line interface: one or more module names, plus optional output files.
parser = argparse.ArgumentParser()
490
# nargs="+" means at least one module name is required.
parser.add_argument("modules", nargs="+", type=str, help="""The Python modules to parse.""")
496
help="""Output files.
497
If given, must have the same number of items as 'modules'.
498
If not given, stdout is used.""",
500
return parser.parse_args()
505
# When no output files were given, pair every module with None (`py2md`'s
# default for `out`).
outputs = opts.out if opts.out else [None] * len(opts.modules)
506
if len(outputs) != len(opts.modules):
507
raise ValueError("Number inputs and outputs should be the same.")
508
# NOTE(review): `Pool` comes from `multiprocessing`, so these are worker
# processes; the variable name and the log message below call them threads.
n_threads = cpu_count()
510
# Use the pool only when there are more modules than CPUs and output files
# were given.
if len(opts.modules) > n_threads and opts.out:
512
# NOTE(review): for these non-negative integers this is equivalent to the
# simpler `max(1, len(outputs) // n_threads)`.
chunk_size = max([1, int(len(outputs) / n_threads)])
513
logger.info("Using %d threads", n_threads)
514
with Pool(n_threads) as p:
515
# `imap` yields results in input order; each task is one (module, out) pair.
for result in p.imap(_py2md_wrapper, zip(opts.modules, outputs), chunk_size):
521
# Sequential path: process the modules one at a time in this process.
for module, out in zip(opts.modules, outputs):
522
result = py2md(module, out)
525
logger.info("Processed %d modules", len(opts.modules))
527
logger.error("Found %d errors", errors)
531
if __name__ == "__main__":