# Source-control provenance (commit 9f0a216c5e):
# - Created index.html template for the homepage with service cards and partner logos.
# - Added page_from_md.html template for rendering pages from markdown.
# - Developed services.html template detailing various services offered.
# - Implemented tests for link handling in markdown, ensuring external links
#   open in new tabs and internal links function correctly.
# - Enhanced markdown parser tests to validate heading extraction, content
#   rendering, and link safety.
# - Introduced utility tests for template rendering, HTML minification, and
#   JavaScript minification.
# Co-authored-by: Copilot <copilot@github.com>
"""
|
|
Markdown parser for converting Markdown files into structured component data.
|
|
|
|
This module reads Markdown files and returns a structured representation that maps
|
|
heading levels to component types:
|
|
- H1 (#) -> page title / hero
|
|
- H2 (##) -> major sections
|
|
- H3 (###) -> cards or subsections within sections
|
|
- Lists -> converted to component-compatible format
|
|
"""
|
|
|
|
import html
import os
import re
import textwrap
from typing import Any, Dict, List, Optional, cast

import markdown
from bs4 import BeautifulSoup
from markdown.extensions import Extension
from markdown.preprocessors import Preprocessor
from markdown.treeprocessors import Treeprocessor

from lib.types import Card, Detail, PageData, ParserState, Section
|
|
|
|
|
|
class HeadingCollector(Treeprocessor):
    """
    Markdown tree processor that records headings, lists, and the intro text
    appearing between the H1 and the first subsequent heading.

    After ``run`` executes:
      - ``self.headings`` holds one dict per heading/list, in document order.
      - ``self.main_intro`` holds paragraph text collected after the H1.
    """

    def __init__(self, md: Any) -> None:
        super().__init__(md)
        # Ordered record of headings and lists discovered during `run`.
        self.headings: List[Dict[str, Any]] = []
        self.current_content: List[str] = []
        self.main_intro: str = ''

    def run(self, root: Any) -> Any:
        """Process the element tree and collect headings with content."""
        self.headings = []
        self.main_intro = ''
        collecting_intro = False
        for element in root:
            if element.tag == 'h1':
                # Everything after the H1 (until another heading) is intro.
                collecting_intro = True
            elif collecting_intro and element.tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
                collecting_intro = False
            elif collecting_intro:
                intro_text = self._extract_text(element)
                if intro_text:
                    if self.main_intro:
                        self.main_intro += '\n\n' + intro_text
                    else:
                        self.main_intro = intro_text
            self._process_element(element)
        return root

    def _process_element(self, element: Any) -> None:
        """Recursively process elements to extract heading structure."""
        if element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # Convert heading tag to level (e.g., 'h2' -> 2).
            level = int(element.tag[1])
            self.headings.append({
                'level': level if level > 1 else 2,  # Treat H1 as level 2 for sectioning
                'text': self._extract_text(element),
                'tag': element.tag,
                'element': element,
            })
        elif element.tag in ('ul', 'ol'):
            # Record list items as plain strings.
            self.headings.append({
                'type': element.tag,
                'items': [self._extract_text(li) for li in element],
                'element': element,
            })
        else:
            # Not structural itself; descend looking for nested headings/lists.
            for child in element:
                self._process_element(child)

    def _extract_text(self, element: Any) -> str:
        """Extract all text from an element and its children.

        Bug fix: the previous version returned only ``element.text`` when it
        was non-empty, silently dropping text inside child elements (e.g. the
        emphasised part of ``Title with **bold**``). Child text and tails are
        now always appended.
        """
        text = element.text or ''
        for child in element:
            text += self._extract_text(child)
            if child.tail:
                text += child.tail
        return text.strip()
|
|
|
|
|
|
class DedentPreprocessor(Preprocessor):
    """Normalize leading indentation so headings aren't treated as code blocks."""

    def run(self, lines: List[str]) -> List[str]:
        # Join, strip the common leading whitespace, and re-split so the
        # markdown parser never sees accidental 4-space "code block" indents.
        return textwrap.dedent('\n'.join(lines)).split('\n')
|
|
|
|
|
|
class HeadingExtension(Extension):
    """Markdown extension wiring up the dedent and heading-collection passes."""

    def extendMarkdown(self, md: Any) -> None:
        # Dedent runs early (priority 27), before block parsing; the
        # collector runs late (priority 5) over the finished element tree.
        dedenter = DedentPreprocessor(md)
        collector = HeadingCollector(md)
        md.preprocessors.register(dedenter, 'dedent_preprocessor', 27)
        md.treeprocessors.register(collector, 'heading_collector', 5)
|
|
|
|
|
|
def parse_markdown_file(file_path: str) -> PageData:
    """
    Parse a Markdown file and return a structured representation.

    Args:
        file_path: Path to the Markdown file to parse.

    Returns:
        PageData: A nested dictionary representing the page structure.

    Raises:
        FileNotFoundError: If the file does not exist.
        IOError: If there is an error reading the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_markdown = handle.read()

    # Delegate structural parsing to the shared builder.
    return build_component_structure(raw_markdown, file_path)
|
|
|
|
|
|
def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]:
    """Return Markdown filenames from the given docs directory."""
    # EAFP: a missing directory simply yields no files.
    try:
        entries = os.listdir(docs_dir)
    except FileNotFoundError:
        return []
    return [name for name in entries if name.endswith('.md')]
|
|
|
|
|
|
def markdown_filename_to_html_filename(md_filename: str) -> str:
    """Convert a Markdown filename to its lower-cased HTML counterpart.

    Bug fix: ``str.replace`` previously rewrote *every* ``.md`` occurrence,
    so ``notes.md.md`` became ``notes.html.html``. Only the final ``.md``
    extension is rewritten now.
    """
    base, ext = os.path.splitext(md_filename)
    if ext == '.md':
        md_filename = base + '.html'
    return md_filename.lower()
|
|
|
|
|
|
def build_component_structure(
        markdown_content: str, file_path: str) -> PageData:
    """
    Build a nested component structure from Markdown content.

    Headings map to a hierarchy (H2 -> sections, H3 -> cards, H4 -> details)
    suitable for rendering with component templates.

    Improvement: the end-of-input flush now reuses the shared
    ``flush_content_buffer_to_card`` / ``flush_content_buffer_to_section``
    helpers instead of duplicating their logic inline, and the redundant
    local-variable "rehydration" step has been dropped.

    Args:
        markdown_content: Raw markdown text.
        file_path: Source path; its basename is the title fallback.

    Returns:
        PageData dict with 'title' and 'sections'.
    """
    page: PageData = {
        'title': None,
        'sections': [],
    }
    # All parser state lives in one dict so module-level helpers can mutate it.
    state: ParserState = {
        'page': page,
        'current_section': None,
        'current_card': None,
        'current_detail': None,
        'content_buffer': [],
        'detail_buffer': [],
    }

    for line in markdown_content.split('\n'):
        process_line_with_state(line, state)

    # Flush whatever is still buffered at end of input.
    flush_detail_buffer(state)
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)

    # Fall back to a title derived from the file name.
    if page['title'] is None:
        filename = os.path.basename(file_path)
        page['title'] = os.path.splitext(filename)[0].replace('_', ' ').title()

    return page
|
|
|
|
|
|
def flush_detail_buffer(state: ParserState) -> None:
    """Render buffered detail lines into the current detail entry, if any."""
    detail = state['current_detail']
    buffered = state['detail_buffer']
    if detail is None or not buffered:
        return
    detail['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
    state['detail_buffer'] = []
|
|
|
|
|
|
def flush_content_buffer_to_card(state: ParserState) -> None:
    """Render buffered content lines into the current card, if any."""
    card = state['current_card']
    buffered = state['content_buffer']
    if card is None or not buffered:
        return
    card['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
|
|
|
|
|
def flush_content_buffer_to_section(state: ParserState) -> None:
    """Render buffered content lines into the current section, if any."""
    section = state['current_section']
    buffered = state['content_buffer']
    if section is None or not buffered:
        return
    section['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
|
|
|
|
|
def start_section(title: str, state: ParserState) -> None:
    """Start a new section with the given title.

    Pending content is flushed into the open card (if one exists, which also
    empties the buffer so the old section cannot double-consume it) or into
    the open section, before the new section becomes current.

    Improvement: the previous version reset ``content_buffer`` and
    ``current_card`` twice; the duplicate dead statements are removed.
    """
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)
    section = cast(Section, {
        'title': title,
        'content': '',
        'cards': []
    })
    sections = state['page'].get('sections', [])
    sections.append(section)
    state['page']['sections'] = sections
    # The new section becomes current; card context and buffer start fresh.
    state['current_section'] = section
    state['current_card'] = None
    state['content_buffer'] = []
|
|
|
|
|
|
def start_card(title: str, state: ParserState) -> None:
    """Start a new card within the current section (no-op without a section)."""
    section = state['current_section']
    if section is None:
        # Cards can only live inside a section; ignore stray H3s.
        return
    # Flush pending content to wherever it belongs before switching cards.
    if state['current_card'] is None:
        flush_content_buffer_to_section(state)
    else:
        flush_content_buffer_to_card(state)
    card = cast(Card, {'title': title, 'content': ''})
    section['cards'].append(card)
    state['current_card'] = card
    state['content_buffer'] = []
|
|
|
|
|
|
def start_detail(title: str, state: ParserState) -> None:
    """Start a new detail within the current card (no-op without a card).

    Consistency fix: the pending detail buffer is flushed via
    ``flush_detail_buffer`` instead of a duplicated inline copy of its logic.
    """
    card = state['current_card']
    if card is None:
        # Details can only live inside a card; ignore stray H4s.
        return
    flush_detail_buffer(state)
    detail = cast(Detail, {
        'title': title,
        'content': ''
    })
    # 'details' is created lazily so cards without details stay lightweight.
    if 'details' not in card:
        card['details'] = []
    card['details'].append(detail)
    state['current_detail'] = detail
|
|
|
|
|
|
def process_line_with_state(line: str, state: ParserState) -> None:
    """Route one markdown line into the page/section/card/detail state.

    Heading prefixes drive structure: '# ' sets the page title, '## ' opens
    a section, '### ' opens a card, '#### ' opens a detail inside a card.
    Anything else accumulates in the active content/detail buffer.
    NOTE(review): blank lines are discarded, so consecutive paragraphs merge
    when a buffer is rendered — confirm this is intended.
    """
    if line.startswith('# '):
        # H1 - page title; close out any intro content gathered so far.
        state['page']['title'] = line[2:].strip()
        if state['current_section'] is not None and state['content_buffer']:
            flush_content_buffer_to_section(state)
        state['current_section'] = None
    elif line.startswith('## '):
        # H2 - major section.
        flush_detail_buffer(state)
        start_section(check_image_in_title(line[3:].strip()), state)
    elif line.startswith('### '):
        # H3 - card within the current section.
        flush_detail_buffer(state)
        start_card(check_image_in_title(line[4:].strip()), state)
    elif line.startswith('#### '):
        # H4 - detail inside a card; outside a card it is plain content.
        if state['current_card'] is not None:
            start_detail(check_image_in_title(line[5:].strip()), state)
        else:
            state['content_buffer'].append(line)
    elif line.strip():
        # Ordinary content goes to whichever buffer is active.
        target = 'detail_buffer' if state['current_detail'] is not None else 'content_buffer'
        state[target].append(line)
|
|
|
|
|
|
def check_image_in_title(title: str) -> str:
    """Replace a Markdown image in *title* with an HTML ``<img>`` tag.

    Surrounding title text is preserved. Fixes:
      - alt text and src are HTML-escaped (``html.escape``) so quotes or
        angle brackets in the Markdown cannot break out of the attributes;
      - the replacement is supplied via a callable so backslashes in the
        tag are not misread as regex group references by ``re.sub``.

    NOTE(review): as before, every image occurrence is replaced with the tag
    built from the *first* match — confirm multi-image titles never occur.
    """
    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    match = re.search(img_pattern, title)
    if match:
        alt_text = html.escape(match.group(1).strip(), quote=True)
        src = html.escape(match.group(2).strip(), quote=True)
        img_tag = f'<img src="{src}" alt="{alt_text}"/>'
        title = re.sub(img_pattern, lambda _: img_tag, title).strip()
    return title
|
|
|
|
|
|
def markdown_to_html_lines(text: str) -> str:
    """
    Convert Markdown text to sanitized HTML.

    Post-processing applied to the rendered HTML:
      - anchors with javascript:/data:/vbscript: hrefs are neutralized,
      - external (http/https) links open in a new tab with
        ``rel="noopener noreferrer"``,
      - images with an empty src are removed; bare relative srcs are
        rewritten under ``img/`` and missing alt text is derived from the
        filename.

    Bug fix: the relative-src rewrite previously emitted the literal string
    ``img/(unknown)`` (a corrupted f-string) instead of ``img/<filename>``.

    Args:
        text: Raw markdown text; an empty string short-circuits to ''.

    Returns:
        The sanitized HTML string.
    """
    if not text:
        return ''

    md = markdown.Markdown()
    rendered = md.convert(text)

    def _is_unsafe(href: str) -> bool:
        # Schemes that can execute script or smuggle arbitrary payloads.
        return href.strip().lower().startswith(
            ('javascript:', 'data:', 'vbscript:'))

    soup = BeautifulSoup(rendered, 'html.parser')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if isinstance(href, (list, tuple)):
            # bs4 can return multi-valued attributes; take the first.
            href = href[0]
        href = (href or '').strip()
        if not href:
            continue
        if _is_unsafe(href):
            # Point the link somewhere inert and drop link-behavior attrs.
            anchor['href'] = '#unsafe'
            anchor.attrs.pop('target', None)
            anchor.attrs.pop('rel', None)
            continue
        if href.startswith(('http://', 'https://')):
            # External links open in a new tab, without window.opener access.
            anchor['target'] = '_blank'
            anchor['rel'] = 'noopener noreferrer'

    for image in soup.find_all('img'):
        src = image.get('src') or ''
        if isinstance(src, (list, tuple)):
            src = src[0]
        src = src.strip()
        if not src:
            # An <img> without a source renders nothing useful; drop it.
            image.decompose()
            continue

        alt = image.get('alt')
        if isinstance(alt, (list, tuple)):
            alt = alt[0]
        alt_text = (alt or '').strip()

        # Rewrite bare relative paths into the site's img/ directory.
        if not src.startswith(('http://', 'https://', '/', 'img/')):
            filename = os.path.basename(src)
            src = f'img/{filename}' if filename else src
            image['src'] = src

        if not alt_text:
            # Derive readable alt text from the file name stem.
            stem = os.path.splitext(os.path.basename(src))[0]
            alt_text = stem.replace('-', ' ').replace('_', ' ').strip()
            image['alt'] = alt_text

    return str(soup)
|