generate-wandb-python-reference/process_sdk_markdown.py at main · ngrayluna/generate-wandb-python-reference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
"""
Enhanced script to remove entire classes, methods, functions, optionally `__init__`
methods, **and individual attribute bullets** flagged with
    <!-- lazydoc-ignore-class-attributes -->
from lazydocs‑generated markdown.
"""
import os
import re
import argparse
import glob
from typing import List, Tuple


class MarkdownCleaner:
    """Post-process lazydocs-generated markdown.

    The cleaner applies a fixed list of regex substitutions, drops blocks
    that contain one of the ``<!-- lazydoc-ignore... -->`` markers, and
    finally moves a class's ``__init__`` signature block in front of the
    class-level ``**Args:**`` section.
    """

    # Class section whose ``**Args:**`` block (and optional ``**Returns:**``
    # block) is directly followed by the ``__init__`` method block.
    #
    # Each free-form span is guarded by ``(?!\n#{2,3} <kbd>)`` so it can never
    # run across another ``## <kbd>...`` / ``### <kbd>...`` header.  Without
    # the guard, an ``**Args:**`` block belonging to some *method* could be
    # paired with the ``__init__`` of a *later* class, and that ``__init__``
    # would be pulled out of its own class.
    _INIT_AFTER_ARGS_PATTERN = re.compile(
        # 1: class header and description
        r'(## <kbd>class</kbd> `[^`]+`(?:(?!\n#{2,3} <kbd>)[\s\S])*?)'
        # 2: Args section
        r'(\n\*\*Args:\*\*(?:(?!\n#{2,3} <kbd>)[\s\S])*?)'
        # 3: optional Returns section
        r'(\n\*\*Returns:\*\*(?:(?!\n#{2,3} <kbd>)[\s\S])*?)?'
        # 4: __init__ header plus its fenced python signature
        r'(\n### <kbd>method</kbd> `[^`]*__init__[^`]*`\n+```python\n__init__\([\s\S]*?\n```)'
    )

    # ------------------------------------------------------------------ #
    def __init__(self) -> None:
        """Compile every pattern used by :meth:`clean_text`."""
        # 1) simple one-off replacements, applied in list order
        self.patterns: List[Tuple[re.Pattern, str]] = [
            # unwrap <a ...>...</a>, keeping only the inner content
            (re.compile(r'<a\b[^>]*>(.*?)</a>', re.DOTALL), r'\1'),
            # drop the last dotted component from a module header
            (re.compile(r'(# <kbd>module</kbd> `[\w\.]+)\.[\w]+`'), r'\1`'),
            # remove the "**Global Variables**" block up to the next header
            (re.compile(
                r"\*\*Global Variables\*\*\n[-]+\n(?:(?!## |# <kbd>)[\s\S])*\n",
                re.MULTILINE
            ), ""),
            # unwrap <b>...</b>
            (re.compile(r'<b>(.*?)</b>'), r'\1'),
            # remove the "automatically generated via lazydocs" footer
            (re.compile(
                r'---\n+_This file was automatically generated via '
                r'\[lazydocs\]\([^)]+\)._\n*'
            ), ""),
            # replace "####" plus any following whitespace with "### "
            (re.compile(r'####\s*'), r'### '),
        ]

        # 2) large-block patterns; a matched block is dropped only when it
        #    contains the corresponding ignore marker (see clean_text)
        self.block_pattern = re.compile(
            r"(?s)(## <kbd>class</kbd> `.*?`|"
            r"### <kbd>(?:method|function)</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?|"
            r"### <kbd>property</kbd> .*?\n\n.*?)(?=\n## |\n### |\Z)"
        )
        self.class_pattern = re.compile(r"(?s)## <kbd>class</kbd> `.*?`.*?(?=\n## <kbd>class</kbd>|$)")
        self.function_pattern = re.compile(r"(?s)## <kbd>function</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\Z)")
        # this one embeds its marker, so it is substituted away directly
        self.init_pattern = re.compile(
            r"(?s)<!-- lazydoc-ignore-init: internal -->\s*"
            r"### <kbd>method</kbd> `.*?__init__.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\n### |\Z)"
        )
        self.classmethod_pattern = re.compile(
            r"(?s)### <kbd>classmethod</kbd> `.*?`\n\n```python\n.*?\n```\n\n.*?(?=\n## |\n### |\Z)"
        )

        # 3) what a single attribute bullet-block looks like
        self.attr_block_pattern = re.compile(
            r"(?sm)^( {0,3}- .*?)"            # top-level bullet start ...
            r"(?=\n {0,3}- |\n## |\n### |\Z)" # ... up to next bullet/header/EOF
        )

    # ------------------------------------------------------------------ #
    def clean_text(self, markdown_text: str) -> str:
        """Return ``markdown_text`` with all cleaning steps applied.

        Steps run in this order: simple substitutions, marker-based block
        removal, removal of flagged ``__init__`` blocks, removal of flagged
        attribute bullets, and finally the ``__init__``/Args reordering.
        """
        cleaned = markdown_text

        # -- simple substitutions
        for pat, repl in self.patterns:
            cleaned = pat.sub(repl, cleaned)

        # -- blocks carrying one of the ignore markers
        cleaned = self._remove_ignored_blocks(cleaned, "<!-- lazydoc-ignore: internal -->", self.block_pattern)
        cleaned = self._remove_ignored_blocks(cleaned, "<!-- lazydoc-ignore-class: internal -->", self.class_pattern)
        cleaned = self._remove_ignored_blocks(cleaned, "<!-- lazydoc-ignore-function: internal -->", self.function_pattern)
        cleaned = self._remove_ignored_blocks(cleaned, "<!-- lazydoc-ignore-classmethod: internal -->", self.classmethod_pattern)
        cleaned = self.init_pattern.sub("", cleaned)

        # -- attribute bullets flagged with the inline marker
        cleaned = self._remove_ignored_blocks(
            cleaned,
            "<!-- lazydoc-ignore-class-attributes -->",
            self.attr_block_pattern
        )

        # -- move __init__ before the Args section in class documentation
        cleaned = self._move_init_before_args(cleaned)

        return cleaned

    # ------------------------------------------------------------------ #
    def _remove_ignored_blocks(self, text: str, token: str, pattern: re.Pattern) -> str:
        """Drop every ``pattern`` match in ``text`` that contains ``token``.

        Matches that do not contain the token are left exactly as they were.
        """
        def keep_or_drop(match: re.Match) -> str:
            return "" if token in match.group(0) else match.group(0)
        return pattern.sub(keep_or_drop, text)

    # ------------------------------------------------------------------ #
    def _move_init_before_args(self, text: str) -> str:
        """
        Move ``__init__`` method blocks in front of class-level Args sections.

        A class section shaped like ``description, **Args:**, [**Returns:**],
        __init__ block`` is rewritten as ``description, __init__ block,
        **Args:**, [**Returns:**]``.  Only the sections of a single class are
        considered: the match never crosses another ``<kbd>`` header, so an
        ``**Args:**`` block of an ordinary method is never paired with the
        ``__init__`` of a different class.  Text that does not have this
        shape is returned unchanged.
        """
        def reorder_match(match: re.Match) -> str:
            # the optional Returns group is "" when it did not participate
            class_header, args_section, returns_section, init_method = match.groups(default="")
            # trailing whitespace of the description is replaced by one newline
            return (
                class_header.rstrip() + "\n"
                + init_method + "\n"
                + args_section
                + returns_section
            )

        return self._INIT_AFTER_ARGS_PATTERN.sub(reorder_match, text)


# ----------------------------------------------------------------------#
def process_text(markdown_text: str) -> str:
    """Clean ``markdown_text`` with a freshly constructed ``MarkdownCleaner``."""
    cleaner = MarkdownCleaner()
    return cleaner.clean_text(markdown_text)


def main(args):
    """Clean every ``*.md`` file in ``args.markdown_directory`` in place.

    Args:
        args: Parsed command-line namespace; only ``markdown_directory`` is
            read. The directory is resolved against the current working
            directory.
    """
    search_glob = os.path.join(os.getcwd(), args.markdown_directory, "*.md")
    for filename in glob.glob(search_glob):
        print("Reading in...", filename)
        # Explicit UTF-8 so the result does not depend on the locale's
        # default encoding.
        with open(filename, "r", encoding="utf-8") as f:
            text = f.read()
        # Clean *before* reopening for writing: opening with "w" truncates
        # the file immediately, so a failure during cleaning must not be
        # able to happen while the file is already empty.
        cleaned = process_text(text)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(cleaned)


if __name__ == "__main__":
    # Command-line entry point: parse the target directory and process it.
    cli = argparse.ArgumentParser(description="Post‑process lazydocs markdown.")
    cli.add_argument(
        "--markdown_directory",
        default="wandb_sdk_docs",
        help="Directory containing markdown files to process",
    )
    parsed_args = cli.parse_args()
    main(parsed_args)