# calminer/scripts/format_docs_md.py

"""Lightweight Markdown formatter: normalizes first-line H1, adds code-fence language hints for common shebangs, trims trailing whitespace.

This is intentionally small and non-destructive; it touches only files under docs/ and makes safe changes.
"""
import re
from pathlib import Path

# Root directory that main() scans for Markdown files: the "docs" folder
# located in the parent of the directory that contains this script
# (parents[1] of the resolved script path).
DOCS = Path(__file__).resolve().parents[1] / "docs"

# Maps a fence language tag to a tuple of names for that language
# (presumably accepted aliases, e.g. "sh" for bash).
# NOTE(review): no function visible in this file reads this table; confirm
# whether it is still needed or was meant to drive the inference heuristics.
CODE_LANG_HINTS = {
    'powershell': ('powershell',),
    'bash': ('bash', 'sh'),
    'sql': ('sql',),
    'python': ('python',),
}


def add_code_fence_language(match):
    """Return a fenced code block, adding a language tag when one can be inferred.

    Designed as an ``re.sub`` callback.  ``match.group(0)`` must be the whole
    fenced block (opening fence, body, closing fence) and ``match.group(1)``
    the body between the fences.

    The language is guessed from the first non-blank line of the body only:

    * a shebang (``#!``) is mapped from its interpreter name
      (python*, sh/bash/zsh/dash/ksh, pwsh/powershell); an unrecognised
      interpreter leaves the block unlabelled rather than guessing;
    * ``$...``, a ``PS`` prompt or a leading ``powershell`` -> ``powershell``;
    * a ``#`` comment, or the words ``import`` / ``from`` -> ``python``;
    * ``select`` / ``insert`` / ``update`` / ``create`` (any case) -> ``sql``;
    * the words ``git`` / ``sudo`` or a leading ``./`` -> ``bash``.

    Keywords are matched as whole words so that text such as ``important`` or
    ``github.com`` is not mistaken for code.

    Returns:
        The block rebuilt as ```` ```<lang>\\n<body>\\n``` ```` when a language
        was inferred; otherwise the matched text unchanged (this includes
        blocks that already carry a language and blocks with an empty body).
    """
    fence = match.group(0)
    # An optional group that did not participate yields None; treat as empty.
    inner = match.group(1) or ''
    # If language already present, return unchanged
    if fence.startswith('```') and len(fence.splitlines()[0].strip()) > 3:
        return fence
    # Try to infer language from the first non-blank line of the code
    stripped = inner.strip()
    code = stripped.splitlines()[0] if stripped else ''
    lang = ''
    if code.startswith('#!'):
        # Must be checked before the generic '#' rule below, which would
        # otherwise label every shell script as python.
        words = [w.rsplit('/', 1)[-1] for w in code[2:].split() if not w.startswith('-')]
        if words and words[0] == 'env':
            # "#!/usr/bin/env python3": the real interpreter follows env.
            words = words[1:]
        interpreter = words[0].lower() if words else ''
        if interpreter.startswith('python'):
            lang = 'python'
        elif interpreter in ('sh', 'bash', 'zsh', 'dash', 'ksh'):
            lang = 'bash'
        elif interpreter in ('pwsh', 'powershell'):
            lang = 'powershell'
    elif code.startswith('$') or re.match(r'PS\b', code) or code.lower().startswith('powershell'):
        lang = 'powershell'
    elif code.startswith('#') or re.match(r'(?:import|from)\b', code):
        lang = 'python'
    elif re.match(r'(?:select|insert|update|create)\b', code, re.I):
        lang = 'sql'
    elif re.match(r'(?:git|sudo)\b', code) or code.startswith('./'):
        lang = 'bash'
    if lang:
        return f'```{lang}\n{inner}\n```'
    return fence


def normalize_file(path: Path) -> bool:
    """Normalize one Markdown file in place and report whether it was rewritten.

    The following changes are applied, in order:

    1. Trailing whitespace is removed from every line, blank lines at the end
       of the file are dropped, and the file ends with exactly one newline.
       A file with no content is left (or made) empty.
    2. The first non-empty line is turned into an H1 by prefixing ``"# "``
       unless it already starts with ``#``.  Lines that start a YAML front
       matter block / thematic break (``---``), a code fence or an HTML
       element are left alone, because prefixing them would corrupt them.
    3. Fenced code blocks whose opening fence has no language are passed to
       ``add_code_fence_language``.

    Args:
        path: Markdown file to normalize; read and written as UTF-8.

    Returns:
        True if the file content changed and was written back, else False.
    """
    orig = path.read_text(encoding='utf-8')

    # Trim trailing whitespace and ensure a single trailing newline.
    lines = [line.rstrip() for line in orig.splitlines()]
    while lines and not lines[-1]:
        lines.pop()

    # Ensure the first non-empty line is a heading.
    for i, ln in enumerate(lines):
        if ln:
            if not ln.startswith(('#', '---', '```', '<')):
                lines[i] = '# ' + ln
            break

    text = '\n'.join(lines) + '\n' if lines else ''

    def label_fence(m):
        block = m.group(0)
        # Only bare fences with a body are candidates; tagged or empty blocks
        # are matched solely so that their closing fence is consumed.
        if m.group(1) is None or not block.startswith('```\n'):
            return block
        return add_code_fence_language(m)

    # Match complete fenced blocks (with or without a language) anchored at
    # line starts.  Consuming tagged blocks whole keeps a closing fence from
    # being mistaken for the opening fence of an unlabelled block, which would
    # otherwise wrap the prose between two code blocks.  The lazy optional
    # group lets an empty block ("```" directly followed by "```") match too.
    text = re.sub(
        r'^```[^\n]*\n(?:([\s\S]*?)\n)??```$',
        label_fence,
        text,
        flags=re.MULTILINE,
    )

    if text != orig:
        path.write_text(text, encoding='utf-8')
        return True
    return False


def main():
    """Format every ``*.md`` file under ``DOCS`` and print a summary.

    Files are visited in sorted order so the report is deterministic.  Each
    file is handled on a best-effort basis: an exception raised while
    formatting one file is printed and the run continues with the next file.

    Changed files are listed relative to the current working directory when
    they live beneath it, and by their full path otherwise.
    """
    changed = []
    cwd = Path.cwd()
    for p in sorted(DOCS.rglob('*.md')):
        if not p.is_file():
            continue
        try:
            was_changed = normalize_file(p)
        except Exception as e:
            print(f"Failed to format {p}: {e}")
            continue
        if not was_changed:
            continue
        # relative_to() raises ValueError when the script is run from a
        # directory that is not an ancestor of the file.  Keep that outside
        # the formatting try-block so an already rewritten file is never
        # reported as a failure.
        try:
            shown = p.relative_to(cwd)
        except ValueError:
            shown = p
        changed.append(str(shown))
    if changed:
        print('Formatted files:')
        for c in changed:
            print(' -', c)
    else:
        print('No formatting changes required.')

if __name__ == '__main__':
    main()