Files
zwitschi 9f0a216c5e Add new templates and tests for improved functionality
- Created index.html template for the homepage with service cards and partner logos.
- Added page_from_md.html template for rendering pages from markdown.
- Developed services.html template detailing various services offered.
- Implemented tests for link handling in markdown, ensuring external links open in new tabs and internal links function correctly.
- Enhanced markdown parser tests to validate heading extraction, content rendering, and link safety.
- Introduced utility tests for template rendering, HTML minification, and JavaScript minification.

Co-authored-by: Copilot <copilot@github.com>
2026-05-02 13:05:43 +02:00

399 lines
13 KiB
Python

"""
Markdown parser for converting Markdown files into structured component data.
This module reads Markdown files and returns a structured representation that maps
heading levels to component types:
- H1 (#) -> page title / hero
- H2 (##) -> major sections
- H3 (###) -> cards or subsections within sections
- Lists -> converted to component-compatible format
"""
import os
import re
import textwrap
import markdown
from bs4 import BeautifulSoup
from markdown.treeprocessors import Treeprocessor
from markdown.preprocessors import Preprocessor
from markdown.extensions import Extension
from typing import Dict, List, Any, Optional, cast
from lib.types import PageData, Section, Card, Detail, ParserState
class HeadingCollector(Treeprocessor):
    """
    Tree processor that records the headings and lists of a parsed document.

    After ``run`` has been called:

    - ``headings`` holds, in document order, one dict per heading
      (``level``, ``text``, ``tag``, ``element``) and one dict per list
      (``type``, ``items``, ``element``).
    - ``main_intro`` holds the text of the top-level elements that follow an
      ``h1`` up to the next heading, joined with blank lines.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _LIST_TAGS = ('ul', 'ol')

    def __init__(self, md: Any) -> None:
        super().__init__(md)
        self.headings: List[Dict[str, Any]] = []
        self.current_content: List[str] = []
        self.main_intro: str = ''

    def run(self, root: Any) -> Any:
        """Walk the top-level elements of ``root`` and return it unchanged.

        Resets ``headings`` and ``main_intro`` before collecting, so every
        call starts from a clean slate.
        """
        self.headings = []
        self.main_intro = ''
        intro_parts: List[str] = []
        in_intro = False
        for node in root:
            tag = node.tag
            if tag == 'h1':
                # Everything after the page title counts as intro text...
                in_intro = True
            elif in_intro:
                if tag in self._HEADING_TAGS[1:]:
                    # ...until the first lower-level heading closes it.
                    in_intro = False
                else:
                    node_text = self._extract_text(node)
                    if node_text:
                        intro_parts.append(node_text)
            self._process_element(node)
        self.main_intro = '\n\n'.join(intro_parts)
        return root

    def _process_element(self, element: Any) -> None:
        """Record ``element`` if it is a heading or list, else descend into it."""
        tag = element.tag
        if tag in self._HEADING_TAGS:
            # The digit in the tag name is the heading depth ('h2' -> 2);
            # an H1 is recorded as level 2 for sectioning purposes.
            depth = max(int(tag[1]), 2)
            self.headings.append({
                'level': depth,
                'text': self._extract_text(element),
                'tag': tag,
                'element': element,
            })
            return
        if tag in self._LIST_TAGS:
            entries: List[str] = [self._extract_text(li) for li in element]
            self.headings.append({
                'type': tag,
                'items': entries,
                'element': element,
            })
            return
        for child in element:
            self._process_element(child)

    def _extract_text(self, element: Any) -> str:
        """Return the stripped text of ``element`` including its descendants."""
        pieces: List[str] = [element.text or '']
        for child in element:
            pieces.append(self._extract_text(child))
            if child.tail:
                pieces.append(child.tail)
        return ''.join(pieces).strip()
class DedentPreprocessor(Preprocessor):
    """Remove indentation shared by all lines so headings aren't read as code blocks."""

    def run(self, lines: List[str]) -> List[str]:
        """Return ``lines`` with their common leading whitespace removed."""
        joined = '\n'.join(lines)
        return textwrap.dedent(joined).split('\n')
class HeadingExtension(Extension):
    """Markdown extension that installs the dedent step and the heading collector."""

    def extendMarkdown(self, md: Any) -> None:
        """Register both processors on the given Markdown instance."""
        dedenter = DedentPreprocessor(md)
        collector = HeadingCollector(md)
        md.preprocessors.register(dedenter, 'dedent_preprocessor', 27)
        md.treeprocessors.register(collector, 'heading_collector', 5)
def parse_markdown_file(file_path: str) -> PageData:
    """
    Read a Markdown file and turn it into the page structure.

    Args:
        file_path (str): Location of the Markdown file.
    Returns:
        PageData: The structure produced by ``build_component_structure``
        for the file's content.
    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        IOError: If the file cannot be read.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    with open(file_path, encoding='utf-8') as source:
        raw_markdown = source.read()

    # The path is passed along so a title can be derived from the file name.
    return build_component_structure(raw_markdown, file_path)
def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]:
    """
    Return the Markdown filenames found directly inside ``docs_dir``.

    Args:
        docs_dir (str): Directory to scan. Only its immediate entries are
            considered; subdirectories are not searched.
    Returns:
        list: Names (not full paths) of the entries ending in ``.md``, in
        sorted order. Empty when ``docs_dir`` is missing or is not a
        directory.
    """
    # isdir() rather than exists(): a path pointing at a regular file would
    # otherwise reach os.listdir() and raise NotADirectoryError.
    if not os.path.isdir(docs_dir):
        return []
    # os.listdir() yields entries in arbitrary order; sort so callers get a
    # stable, reproducible sequence.
    return sorted(
        name for name in os.listdir(docs_dir) if name.endswith('.md'))
def markdown_filename_to_html_filename(md_filename: str) -> str:
    """
    Convert a Markdown filename to its lower-cased HTML counterpart.

    Only a trailing ``.md`` extension is replaced; a ``.md`` that appears
    elsewhere in the name is kept as is.

    Args:
        md_filename (str): File name such as ``About_Us.md``.
    Returns:
        str: The lower-cased name with a trailing ``.md`` swapped for
        ``.html`` (e.g. ``about_us.html``). Names without that extension are
        only lower-cased.
    """
    extension = '.md'
    if md_filename.endswith(extension):
        # Slice the suffix off instead of str.replace(), which would also
        # rewrite any earlier ".md" inside the name.
        md_filename = md_filename[:-len(extension)] + '.html'
    return md_filename.lower()
def build_component_structure(
        markdown_content: str, file_path: str) -> PageData:
    """
    Turn Markdown source into the nested page/section/card structure.

    The content is fed line by line through ``process_line_with_state``;
    whatever is still buffered once the last line has been seen is written to
    the open detail and to the open card (or, without a card, the open
    section).

    Args:
        markdown_content (str): Raw Markdown text.
        file_path (str): Path the text came from; only its file name is
            used, to derive a title when the content never set one.
    Returns:
        PageData: Dict with the page ``title`` and its list of ``sections``.
    """
    page: PageData = {
        'title': None,
        'sections': [],
    }
    # All parser bookkeeping lives in one dict so the module-level helpers
    # can read and update it.
    state: ParserState = {
        'page': page,
        'current_section': None,
        'current_card': None,
        'current_detail': None,
        'content_buffer': [],
        'detail_buffer': [],
    }

    for raw_line in markdown_content.split('\n'):
        process_line_with_state(raw_line, state)

    # Write out whatever is still pending after the final line.
    flush_detail_buffer(state)
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    else:
        flush_content_buffer_to_section(state)

    if page['title'] is None:
        # No title was set: fall back to the file name, e.g.
        # "about_us.md" -> "About Us".
        stem, _extension = os.path.splitext(os.path.basename(file_path))
        page['title'] = stem.replace('_', ' ').title()
    return page
def flush_detail_buffer(state: ParserState) -> None:
    """Render the buffered detail lines into the current detail and empty the buffer.

    Nothing is written when there is no current detail or no buffered line.
    """
    detail = state['current_detail']
    pending = state['detail_buffer']
    if detail is not None and pending:
        joined = '\n'.join(pending).strip()
        detail['content'] = markdown_to_html_lines(joined)
    state['detail_buffer'] = []
def flush_content_buffer_to_card(state: ParserState) -> None:
    """Render the buffered content lines into the current card.

    Does nothing without a current card or with an empty buffer. The buffer
    itself is left as it is; callers reset it.
    """
    card = state['current_card']
    pending = state['content_buffer']
    if card is None or not pending:
        return
    joined = '\n'.join(pending).strip()
    card['content'] = markdown_to_html_lines(joined)
def flush_content_buffer_to_section(state: ParserState) -> None:
    """Render the buffered content lines into the current section.

    Does nothing without a current section or with an empty buffer. The
    buffer itself is left as it is; callers reset it.
    """
    section = state['current_section']
    pending = state['content_buffer']
    if section is None or not pending:
        return
    joined = '\n'.join(pending).strip()
    section['content'] = markdown_to_html_lines(joined)
def start_section(title: str, state: ParserState) -> None:
    """Open a new section titled ``title`` and make it the current one.

    Content buffered so far is first rendered into the open card or, when no
    card is open, into the previous section. The new section starts with
    empty content and no cards; the content buffer and the current card are
    reset.
    """
    if state['current_card'] is not None:
        # Buffered lines belong to the open card, not to the section.
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)

    new_section = cast(Section, {
        'title': title,
        'content': '',
        'cards': []
    })
    state['page'].setdefault('sections', []).append(new_section)
    state['current_section'] = new_section
    state['current_card'] = None
    state['content_buffer'] = []
def start_card(title: str, state: ParserState) -> None:
    """Open a new card titled ``title`` in the current section.

    Ignored when no section is open. Content buffered so far is rendered
    into the previous card, or into the section when this is its first card,
    before the buffer is reset.
    """
    section = state['current_section']
    if section is None:
        return

    if state['current_card'] is None:
        flush_content_buffer_to_section(state)
    else:
        flush_content_buffer_to_card(state)

    new_card = cast(Card, {
        'title': title,
        'content': ''
    })
    section['cards'].append(new_card)
    state['current_card'] = new_card
    state['content_buffer'] = []
def start_detail(title: str, state: ParserState) -> None:
    """Open a new detail titled ``title`` inside the current card.

    Ignored when no card is open. Lines buffered for the previous detail are
    rendered into it first. The card's ``details`` list is created on first
    use.
    """
    card = state['current_card']
    if card is None:
        return

    # Close the previous detail (if any) before opening the next one.
    flush_detail_buffer(state)

    new_detail = cast(Detail, {
        'title': title,
        'content': ''
    })
    card.setdefault('details', []).append(new_detail)
    state['current_detail'] = new_detail
def process_line_with_state(line: str, state: ParserState) -> None:
    """
    Process a single Markdown line, updating the provided state dict.

    Lines starting with ``# ``, ``## ``, ``### `` and ``#### `` set the page
    title or open a section, card or detail respectively. Any other
    non-blank line is buffered: for the open detail when there is one,
    otherwise as card/section content.

    Args:
        line (str): One line of Markdown source.
        state (ParserState): Mutable parser state; modified in place.
    """
    if line.startswith('# '):
        # H1 - page title
        state['page']['title'] = line[2:].strip()
        # A detail cannot outlive the heading that follows it; close it so
        # later lines are not buffered for it.
        flush_detail_buffer(state)
        state['current_detail'] = None
        # intro content before sections
        if state['current_section'] is not None and state['content_buffer']:
            flush_content_buffer_to_section(state)
            state['current_section'] = None
    elif line.startswith('## '):
        # H2 - major section
        flush_detail_buffer(state)
        # Without this reset the previous card's last detail would stay
        # current: the new section's text would be buffered for that detail
        # and overwrite its content at the next flush.
        state['current_detail'] = None
        title = check_image_in_title(line[3:].strip())
        start_section(title, state)
    elif line.startswith('### '):
        # H3 - card or subsection
        flush_detail_buffer(state)
        # Same reasoning as for H2: a new card must not inherit the previous
        # card's open detail.
        state['current_detail'] = None
        title = check_image_in_title(line[4:].strip())
        start_card(title, state)
    elif line.startswith('#### '):
        # H4 - detail inside a card
        if state['current_card'] is not None:
            title = check_image_in_title(line[5:].strip())
            start_detail(title, state)
        else:
            # No card to attach a detail to: keep the raw line as content.
            state['content_buffer'].append(line)
    elif line.strip():
        # NOTE(review): blank lines are skipped here, so paragraph breaks
        # never reach the Markdown converter - confirm this is intended.
        if state['current_detail'] is not None:
            state['detail_buffer'].append(line)
        else:
            state['content_buffer'].append(line)
def check_image_in_title(title: str) -> str:
    """
    Replace Markdown images in a title with HTML ``<img>`` tags.

    Each ``![alt](src)`` occurrence becomes ``<img src="src" alt="alt"/>``
    built from its own alt text and source; the rest of the title is kept.

    Args:
        title (str): Heading text that may contain Markdown images.
    Returns:
        str: The title with every image converted and surrounding whitespace
        stripped, or the title unchanged when it contains no image.
    """
    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'

    def _to_img_tag(match: 're.Match[str]') -> str:
        # Encode double quotes so they cannot end the attribute value early.
        alt_text = match.group(1).strip().replace('"', '&quot;')
        src = match.group(2).strip().replace('"', '&quot;')
        return f'<img src="{src}" alt="{alt_text}"/>'

    # A callable replacement is used on purpose: it builds the tag per match
    # (so several images keep their own src/alt) and its result is inserted
    # literally, so backslashes in a path are not read as regex escapes.
    converted, replaced = re.subn(img_pattern, _to_img_tag, title)
    if not replaced:
        return title
    return converted.strip()
def markdown_to_html_lines(text: str) -> str:
    """
    Convert Markdown text to HTML and post-process its links and images.

    Links:
    - ``javascript:``, ``data:`` and ``vbscript:`` hrefs are replaced with
      ``#unsafe`` and lose any ``target``/``rel`` attribute.
    - ``http://`` and ``https://`` hrefs get ``target="_blank"`` and
      ``rel="noopener noreferrer"``.
    - Every other href is left untouched.

    Images:
    - Images without a ``src`` are removed.
    - A ``src`` that does not start with ``http://``, ``https://``, ``/`` or
      ``img/`` is rewritten to ``img/<file name>``.
    - A missing ``alt`` is derived from the file name, with ``-`` and ``_``
      turned into spaces.

    Args:
        text (str): Markdown source; an empty string yields an empty result.
    Returns:
        str: The processed HTML.
    """
    if not text:
        return ''

    rendered = markdown.Markdown().convert(text)

    unsafe_schemes = ('javascript:', 'data:', 'vbscript:')

    def _is_unsafe(href: str) -> bool:
        # Browsers drop ASCII tabs/newlines inside a URL and control
        # characters/spaces around it before resolving the scheme, so
        # "java\tscript:..." still runs as script. Remove every such
        # character before testing; this is deliberately stricter than the
        # browser rule.
        normalized = re.sub(r'[\x00-\x20]+', '', href).lower()
        return normalized.startswith(unsafe_schemes)

    soup = BeautifulSoup(rendered, 'html.parser')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if isinstance(href, (list, tuple)):
            href = href[0]
        href = (href or '').strip()
        if not href:
            continue
        if _is_unsafe(href):
            # Neutralise the link and drop attributes that only make sense
            # for a real target.
            anchor['href'] = '#unsafe'
            anchor.attrs.pop('target', None)
            anchor.attrs.pop('rel', None)
            continue
        if href.startswith(('http://', 'https://')):
            # External link: open in a new tab without exposing the opener.
            anchor['target'] = '_blank'
            anchor['rel'] = 'noopener noreferrer'

    for image in soup.find_all('img'):
        src = image.get('src') or ''
        if isinstance(src, (list, tuple)):
            src = src[0]
        src = src.strip()
        if not src:
            image.decompose()
            continue

        alt = image.get('alt')
        if isinstance(alt, (list, tuple)):
            alt = alt[0]
        alt_text = (alt or '').strip()

        # Relative paths are mapped onto the img/ directory by file name.
        if not src.startswith(('http://', 'https://', '/', 'img/')):
            filename = os.path.basename(src)
            src = f'img/{filename}' if filename else src
        image['src'] = src

        if not alt_text:
            # Fall back to a readable form of the file name.
            stem = os.path.splitext(os.path.basename(src))[0]
            alt_text = stem.replace('-', ' ').replace('_', ' ').strip()
        image['alt'] = alt_text

    return str(soup)