""" Markdown parser for converting Markdown files into structured component data. This module reads Markdown files and returns a structured representation that maps heading levels to component types: - H1 (#) -> page title / hero - H2 (##) -> major sections - H3 (###) -> cards or subsections within sections - Lists -> converted to component-compatible format """ import os import re import textwrap import markdown from bs4 import BeautifulSoup from markdown.treeprocessors import Treeprocessor from markdown.preprocessors import Preprocessor from markdown.extensions import Extension from typing import Dict, List, Any, Optional, cast from lib.types import PageData, Section, Card, Detail, ParserState class HeadingCollector(Treeprocessor): """ Custom Markdown tree processor that collects headings and their content. """ def __init__(self, md: Any) -> None: super().__init__(md) self.headings: List[Dict[str, Any]] = [] self.current_content: List[str] = [] self.main_intro: str = '' def run(self, root: Any) -> Any: """Process the element tree and collect headings with content.""" self.headings = [] self.main_intro = '' collecting_intro = False for element in root: if element.tag == 'h1': collecting_intro = True elif collecting_intro and element.tag in ['h2', 'h3', 'h4', 'h5', 'h6']: collecting_intro = False elif collecting_intro: intro_text = self._extract_text(element) if intro_text: if self.main_intro: self.main_intro += '\n\n' + intro_text else: self.main_intro = intro_text self._process_element(element) return root def _process_element(self, element: Any) -> None: """Recursively process elements to extract heading structure.""" if element.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: # Convert heading tag to level (e.g., 'h2' -> 2) level = int(element.tag[1]) text = self._extract_text(element) self.headings.append({ 'level': level if level > 1 else 2, # Treat H1 as level 2 for sectioning 'text': text, 'tag': element.tag, 'element': element, }) elif element.tag in ['ul', 'ol']: # Extract list items items: List[str] = [] for li in element: items.append(self._extract_text(li)) self.headings.append({ 'type': element.tag, 'items': items, 'element': element, }) else: # Process children for child in element: self._process_element(child) def _extract_text(self, element: Any) -> str: """Extract all text from an element and its children.""" if element.text: text = element.text else: text = '' for child in element: text += self._extract_text(child) if child.tail: text += child.tail return text.strip() class DedentPreprocessor(Preprocessor): """Normalize leading indentation so headings aren't treated as code blocks.""" def run(self, lines: List[str]) -> List[str]: text = '\n'.join(lines) dedented = textwrap.dedent(text) return dedented.split('\n') class HeadingExtension(Extension): """Markdown extension to collect headings.""" def extendMarkdown(self, md: Any) -> None: md.preprocessors.register( DedentPreprocessor(md), 'dedent_preprocessor', 27) md.treeprocessors.register( HeadingCollector(md), 'heading_collector', 5) def parse_markdown_file(file_path: str) -> PageData: """ Parse a Markdown file and return a structured representation. Args: file_path (str): Path to the Markdown file to parse. Returns: dict: A nested dictionary representing the page structure. Raises: FileNotFoundError: If the file does not exist. IOError: If there is an error reading the file. """ if not os.path.exists(file_path): raise FileNotFoundError(f"Markdown file not found: {file_path}") with open(file_path, 'r', encoding='utf-8') as f: content = f.read() # Parse the content to extract structure return build_component_structure(content, file_path) def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]: """Return Markdown filenames from the given docs directory.""" if not os.path.exists(docs_dir): return [] return [f for f in os.listdir(docs_dir) if f.endswith('.md')] def markdown_filename_to_html_filename(md_filename: str) -> str: """Convert a Markdown filename to its HTML counterpart.""" return md_filename.replace('.md', '.html').lower() def build_component_structure( markdown_content: str, file_path: str) -> PageData: """ Build a nested component structure from Markdown content. This function parses Markdown headings and content into a hierarchical structure suitable for rendering with component templates. """ lines = markdown_content.split('\n') page: PageData = { 'title': None, 'sections': [], } current_section: Optional[Section] = None current_card: Optional[Card] = None current_detail: Optional[Detail] = None content_buffer: List[str] = [] detail_buffer: List[str] = [] # Move local state into a dict so module-level helpers can operate on it state: ParserState = { 'page': page, 'current_section': current_section, 'current_card': current_card, 'current_detail': current_detail, 'content_buffer': content_buffer, 'detail_buffer': detail_buffer, } for line in lines: process_line_with_state(line, state) # Rehydrate locals from state current_section = state['current_section'] current_card = state['current_card'] current_detail = state['current_detail'] content_buffer = state['content_buffer'] detail_buffer = state['detail_buffer'] # Flush remaining content flush_detail_buffer(state) # Flush content buffer to card or section as appropriate if current_card is not None and content_buffer: current_card['content'] = markdown_to_html_lines( '\n'.join(content_buffer).strip()) elif current_section is not None and content_buffer: current_section['content'] = markdown_to_html_lines( '\n'.join(content_buffer).strip()) if page['title'] is None: filename = os.path.basename(file_path) page['title'] = os.path.splitext(filename)[0].replace('_', ' ').title() return page def flush_detail_buffer(state: ParserState) -> None: """Flush the detail buffer into the current detail entry.""" current_detail = state['current_detail'] detail_buffer = state['detail_buffer'] if current_detail is not None and detail_buffer: current_detail['content'] = markdown_to_html_lines( '\n'.join(detail_buffer).strip()) state['detail_buffer'] = [] def flush_content_buffer_to_card(state: ParserState) -> None: """Flush the content buffer into the current card.""" current_card = state['current_card'] content_buffer = state['content_buffer'] if current_card is not None and content_buffer: current_card['content'] = markdown_to_html_lines( '\n'.join(content_buffer).strip()) def flush_content_buffer_to_section(state: ParserState) -> None: """Flush the content buffer into the current section.""" current_section = state['current_section'] content_buffer = state['content_buffer'] if current_section is not None and content_buffer: current_section['content'] = markdown_to_html_lines( '\n'.join(content_buffer).strip()) def start_section(title: str, state: ParserState) -> None: """Start a new section with the given title.""" if state['current_card'] is not None: flush_content_buffer_to_card(state) state['content_buffer'] = [] state['current_card'] = None if state['current_section'] is not None: flush_content_buffer_to_section(state) section = cast(Section, { 'title': title, 'content': '', 'cards': [] }) sections = state['page'].get('sections', []) sections.append(section) state['page']['sections'] = sections state['current_section'] = section state['content_buffer'] = [] state['current_card'] = None def start_card(title: str, state: ParserState) -> None: """Start a new card within the current section.""" if state['current_section'] is None: return if state['current_card'] is None: flush_content_buffer_to_section(state) else: flush_content_buffer_to_card(state) card = cast(Card, { 'title': title, 'content': '' }) state['current_section']['cards'].append(card) state['current_card'] = card state['content_buffer'] = [] def start_detail(title: str, state: ParserState) -> None: """Start a new detail within the current card.""" if state['current_card'] is None: return if state['current_detail'] is not None and state['detail_buffer']: state['current_detail']['content'] = markdown_to_html_lines( '\n'.join(state['detail_buffer']).strip()) state['detail_buffer'] = [] detail = cast(Detail, { 'title': title, 'content': '' }) if 'details' not in state['current_card']: state['current_card']['details'] = [] state['current_card']['details'].append(detail) state['current_detail'] = detail def process_line_with_state(line: str, state: ParserState) -> None: """Process a single markdown line, updating the provided state dict.""" if line.startswith('# '): # H1 - page title state['page']['title'] = line[2:].strip() # intro content before sections if state['current_section'] is not None and state['content_buffer']: flush_content_buffer_to_section(state) state['current_section'] = None elif line.startswith('## '): # H2 - major section flush_detail_buffer(state) title = line[3:].strip() title = check_image_in_title(title) start_section(title, state) elif line.startswith('### '): # H3 - card or subsection flush_detail_buffer(state) title = line[4:].strip() title = check_image_in_title(title) start_card(title, state) elif line.startswith('#### '): # H4 - detail inside a card if state['current_card'] is not None: title = line[5:].strip() title = check_image_in_title(title) start_detail(title, state) else: state['content_buffer'].append(line) elif line.strip(): if state['current_detail'] is not None: state['detail_buffer'].append(line) else: state['content_buffer'].append(line) def check_image_in_title(title: str) -> str: """ Check if there is an image in the title, preserve original title text and create HTML img tag with alt text and src. """ img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)' match = re.search(img_pattern, title) if match: alt_text = match.group(1).strip() src = match.group(2).strip() img_tag = f'{alt_text}' title = re.sub(img_pattern, img_tag, title).strip() return title def markdown_to_html_lines(text: str) -> str: """ Convert Markdown text to HTML. """ if not text: return '' md = markdown.Markdown() html = md.convert(text) def _is_unsafe(href: str) -> bool: lower = href.strip().lower() return lower.startswith('javascript:') or lower.startswith( 'data:') or lower.startswith('vbscript:') soup = BeautifulSoup(html, 'html.parser') for anchor in soup.find_all('a'): href = anchor.get('href') if isinstance(href, (list, tuple)): href = href[0] href = (href or '').strip() if not href: continue if _is_unsafe(href): anchor['href'] = '#unsafe' anchor.attrs.pop('target', None) anchor.attrs.pop('rel', None) continue if href.startswith('http://') or href.startswith('https://'): anchor['target'] = '_blank' anchor['rel'] = 'noopener noreferrer' for image in soup.find_all('img'): src = image.get('src') or '' if isinstance(src, (list, tuple)): src = src[0] src = src.strip() if not src: image.decompose() continue alt = image.get('alt') if isinstance(alt, (list, tuple)): alt = alt[0] alt_text = (alt or '').strip() # Determine final src for relative paths if not (src.startswith('http://') or src.startswith('https://') or src.startswith('/') or src.startswith('img/')): filename = os.path.basename(src) src = f'img/{filename}' if filename else src image['src'] = src if not alt_text: alt_text = os.path.splitext(os.path.basename(src))[ 0].replace('-', ' ').replace('_', ' ').strip() image['alt'] = alt_text return str(soup)