generate-wandb-python-reference/mintlify_workspaces_report_docs.py at main · ngrayluna/generate-wandb-python-reference · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
import os
import re
import glob
import argparse

def remove_images(markdown_text):
    """Strip every ``<img ...>`` tag out of the markdown text.

    Args:
        markdown_text (str): Markdown content that may embed HTML image tags.

    Returns:
        str: The text with all ``<img>`` tags deleted and with leading and
        trailing whitespace trimmed from the overall result.
    """
    image_tag = re.compile(r'<img[^>]*>')
    without_images = image_tag.sub('', markdown_text)
    return without_images.strip()

def remove_markdownlint_disable(markdown_text):
    """Drop ``<!-- markdownlint-disable -->`` HTML comments from the text.

    Whitespace inside the comment delimiters is optional, so both
    ``<!--markdownlint-disable-->`` and ``<!--  markdownlint-disable  -->``
    are removed.

    Args:
        markdown_text (str): Markdown content to clean.

    Returns:
        str: The text without the comments, trimmed of leading and trailing
        whitespace.
    """
    lint_comment = re.compile(r'<!--\s*markdownlint-disable\s*-->')
    return lint_comment.sub('', markdown_text).strip()


def remove_module_header(markdown_text):
    """Delete module heading lines such as ``# <kbd>module</kbd> `pkg.mod```.

    Any whitespace (including newlines) that directly follows the heading is
    consumed together with it.

    Args:
        markdown_text (str): Markdown content to clean.

    Returns:
        str: The text without module headings, trimmed of leading and
        trailing whitespace.
    """
    module_heading = re.compile(r'#\s*<kbd>module</kbd>\s*`[^`]*`\s*\n?')
    stripped_of_heading = module_heading.sub('', markdown_text)
    return stripped_of_heading.strip()


def remove_internal_classes(markdown_text):
    """Remove the heading of every class whose description starts with INTERNAL.

    A match looks like::

        ## <kbd>class</kbd> `ClassName`
        INTERNAL: This class is not for public use.
        ---

    The heading may use any number of ``#`` characters. The heading line, the
    ``INTERNAL:`` line, any whitespace after it and an optional ``---``
    separator are removed; text beyond that point is left in place.

    Args:
        markdown_text (str): Markdown content to clean.

    Returns:
        str: The text without the matched spans, trimmed of leading and
        trailing whitespace.
    """
    internal_class = re.compile(
        r'#{1,}\s*<kbd>class</kbd>\s*`[^`]*`\s*\nINTERNAL:[^\n]*[\s\n]*(?:---)?'
    )
    return internal_class.sub('', markdown_text).strip()


def remove_empty_lines(markdown_text):
    """Collapse blank (empty or whitespace-only) lines out of the text.

    Args:
        markdown_text (str): Markdown content to compact.

    Returns:
        str: The text with each run of blank lines replaced by a single
        newline, trimmed of leading and trailing whitespace.
    """
    blank_run = re.compile(r'\n\s*\n')
    compacted = blank_run.sub('\n', markdown_text)
    return compacted.strip()


def strip_trailing_whitespace(text):
    """Remove spaces and tabs found at the end of each line.

    Newlines themselves are preserved, so the number of lines is unchanged.

    Args:
        text (str): Text to clean.

    Returns:
        str: The text with trailing spaces/tabs removed from every line.
    """
    # MULTILINE makes ``$`` match at the end of every line, not only at the
    # end of the whole string.
    trailing_blanks = re.compile(r'[ \t]+$', re.MULTILINE)
    return trailing_blanks.sub('', text)


def rename_markdown_file(old_filename, output_directory="."):
    """Rename a markdown file to ``<output_directory>/<name>.mdx``.

    ``<name>`` is the second dot-separated component of the file's base name,
    e.g. ``pkg.reports.v2.md`` becomes ``reports.mdx``.

    Args:
        old_filename (str): Path of the existing file.
        output_directory (str): Directory that receives the renamed file.

    Raises:
        IndexError: If the base name of ``old_filename`` contains no ``.``.
    """
    name_parts = os.path.basename(old_filename).split('.')
    destination = os.path.join(output_directory, name_parts[1] + ".mdx")
    os.rename(old_filename, destination)

def _markdown_title(filename):
    """
    Create markdown title based on the filename read in.
    """
    base_name = os.path.basename(filename).split('.')[1].capitalize()
    return f"title: {base_name}\n"


def add_frontmatter(filename):
    """Build the frontmatter block for a markdown file.

    Args:
        filename (str): Name of the file; the title is derived from it.

    Returns:
        str: The title line enclosed between two ``---`` delimiter lines.
    """
    delimiter = "---\n"
    return "".join((delimiter, _markdown_title(filename), delimiter))


def add_github_import_statement():
    """Return the MDX import line for the ``GitHubLink`` component.

    Returns:
        str: The import statement followed by a blank line.
    """
    import_line = "import { GitHubLink } from '/snippets/en/_includes/github-source-link.mdx';"
    return f"{import_line}\n\n"

def format_github_button(filename, base_url="https://github.com/wandb/wandb-workspaces/blob/main/wandb_workspaces/"):
    """Build the ``GitHubLink`` markup pointing at a module's source file.

    The link target is ``<base_url>/<name>/internal.py``, where ``<name>`` is
    the second dot-separated component of the file's base name. When
    ``<name>`` contains ``"reports"``, the third component is inserted as a
    version directory: ``<base_url>/<name>/<version>/internal.py``.

    Args:
        filename (str): Name of the file.
        base_url (str): Base URL for the GitHub button.

    Returns:
        str: The markup produced by ``_github_button`` for the computed URL.

    Raises:
        IndexError: If the base name has too few dot-separated components.
    """
    name_parts = os.path.basename(filename).split('.')
    name = name_parts[1]

    path_segments = [name]
    if "reports" in name:
        path_segments.append(name_parts[2])
    path_segments.append("internal.py")

    # URLs always use "/" regardless of the host OS, so join by hand instead
    # of os.path.join (which would insert "\\" on Windows when base_url has
    # no trailing slash).
    separator = "" if (not base_url or base_url.endswith("/")) else "/"
    href_links = base_url + separator + "/".join(path_segments)
    return _github_button(href_links)

def _github_button(href_links):
    """To do: Add hugo scripting to add this function. For now, just add code line # for debugging.

    Args:
        href_links (str): URL for the GitHub button.
    """
    return '<GitHubLink url="' + href_links + '" />' + "\n\n"

def add_public_preview_note():
    """Return the ``<Note>`` admonition announcing Public Preview status.

    Returns:
        str: The admonition markup followed by a blank line.
    """
    return "<Note>\nW&B Report and Workspace API is in Public Preview.\n</Note>\n\n"


def alphabetize_headings(markdown_text):
    """Reorder the class sections of a markdown document alphabetically.

    The text before the first ``---`` is treated as the module docstring and
    kept at the top. The remainder is cut into blocks, each starting at a
    ``---``. A block containing a ``## <kbd>class</kbd> `Name``` heading
    opens a new section; following blocks without such a heading are attached
    to the section that is currently open. Blocks that appear before the
    first class heading belong to no section and are not part of the output.

    Args:
        markdown_text (str): Markdown content to reorder.

    Returns:
        str: The stripped docstring, a blank line, then the sections sorted
        by class name (case-sensitive) and joined by blank lines.
    """
    # Everything ahead of the first separator is the module docstring. When
    # no separator exists, the whole text is the docstring and the rest is
    # empty.
    head, separator, tail = markdown_text.partition('---')
    docstring = head.strip()
    rest_of_content = separator + tail

    class_heading = re.compile(r'## <kbd>class</kbd> `([^`]+)`')

    # Each entry is [class_name, accumulated_content].
    grouped = []
    # The lookahead keeps each "---" at the start of the block it introduces.
    for chunk in re.split(r'(?=---)', rest_of_content):
        heading = class_heading.search(chunk)
        if heading is not None:
            grouped.append([heading.group(1), chunk])
        elif grouped:
            # Non-class block (e.g. a method): it stays with its class.
            grouped[-1][1] += chunk

    ordered = sorted(grouped, key=lambda entry: entry[0])
    body = "\n\n".join(content for _, content in ordered)
    return docstring + "\n\n" + body


def main(args):
    """Post-process every ``*.md`` file in the configured directory.

    For each file found under ``<cwd>/<args.markdown_directory>`` the text is
    cleaned (markdownlint comment, module header, images and INTERNAL class
    headings removed), class sections are alphabetized and trailing
    whitespace is stripped. The file is then rewritten in place with the
    frontmatter, the GitHub import statement, the GitHub link and the public
    preview note in front of the cleaned text, and finally renamed to
    ``.mdx``.

    Args:
        args (argparse.Namespace): Parsed arguments; only
            ``markdown_directory`` is read.
    """
    search_pattern = os.path.join(os.getcwd(), args.markdown_directory, "*.md")

    for filename in glob.glob(search_pattern):

        print("Processing...", filename)
        # Explicit UTF-8 so reading and writing do not depend on the
        # platform's locale default encoding.
        with open(filename, "r+", encoding="utf-8") as file:

            # Process markdown text
            markdown_text = file.read()
            markdown_text = remove_markdownlint_disable(markdown_text)
            markdown_text = remove_module_header(markdown_text)
            markdown_text = remove_images(markdown_text)
            markdown_text = remove_internal_classes(markdown_text)
            # markdown_text = remove_empty_lines(markdown_text)
            markdown_text = alphabetize_headings(markdown_text)
            markdown_text = strip_trailing_whitespace(markdown_text)

            # Overwrite from the start of the file: generated header parts
            # first, then the cleaned text.
            file.seek(0)
            file.write(add_frontmatter(filename))
            file.write(add_github_import_statement())
            file.write(format_github_button(filename))
            file.write(add_public_preview_note())
            file.write(markdown_text)
            # Drop any leftover bytes if the new content is shorter than the
            # old content.
            file.truncate()

        # Rename markdown file and change extension to .mdx
        rename_markdown_file(filename, output_directory=args.markdown_directory)


if __name__ == "__main__":
    # Command-line entry point: parse the target directory and run main().
    cli = argparse.ArgumentParser(
        description="Add frontmatter and GitHub link to markdown files.",
    )
    cli.add_argument(
        "--markdown_directory",
        default="wandb_sdk_docs",
        help="Directory containing markdown files to process",
    )
    main(cli.parse_args())