Add new templates and tests for improved functionality
- Created index.html template for the homepage with service cards and partner logos.
- Added page_from_md.html template for rendering pages from markdown.
- Developed services.html template detailing various services offered.
- Implemented tests for link handling in markdown, ensuring external links open in new tabs and internal links function correctly.
- Enhanced markdown parser tests to validate heading extraction, content rendering, and link safety.
- Introduced utility tests for template rendering, HTML minification, and JavaScript minification.

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
Markdown parser for converting Markdown files into structured component data.
|
||||
|
||||
This module reads Markdown files and returns a structured representation that maps
|
||||
heading levels to component types:
|
||||
- H1 (#) -> page title / hero
|
||||
- H2 (##) -> major sections
|
||||
- H3 (###) -> cards or subsections within sections
|
||||
- Lists -> converted to component-compatible format
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import textwrap
|
||||
import markdown
|
||||
from bs4 import BeautifulSoup
|
||||
from markdown.treeprocessors import Treeprocessor
|
||||
from markdown.preprocessors import Preprocessor
|
||||
from markdown.extensions import Extension
|
||||
from typing import Dict, List, Any, Optional, cast
|
||||
from lib.types import PageData, Section, Card, Detail, ParserState
|
||||
|
||||
|
||||
class HeadingCollector(Treeprocessor):
    """
    Custom Markdown tree processor that collects headings and their content.

    After ``run`` executes, ``self.headings`` holds one record per heading or
    list encountered (in document order), and ``self.main_intro`` holds the
    paragraph text found between the first H1 and the next heading.
    """

    def __init__(self, md: Any) -> None:
        super().__init__(md)
        # Collected heading/list records, in document order.
        self.headings: List[Dict[str, Any]] = []
        # NOTE(review): never read or written again in this module —
        # candidate for removal once confirmed no external code uses it.
        self.current_content: List[str] = []
        # Intro paragraphs between the H1 and the first subsequent heading.
        self.main_intro: str = ''

    def run(self, root: Any) -> Any:
        """Process the element tree and collect headings with content."""
        self.headings = []
        self.main_intro = ''
        collecting_intro = False
        for element in root:
            if element.tag == 'h1':
                # Start capturing intro text after the page title.
                collecting_intro = True
            elif collecting_intro and element.tag in ['h2', 'h3', 'h4', 'h5', 'h6']:
                # The first structural heading ends the intro region.
                collecting_intro = False
            elif collecting_intro:
                intro_text = self._extract_text(element)
                if intro_text:
                    # Join multiple intro paragraphs with a blank line.
                    if self.main_intro:
                        self.main_intro += '\n\n' + intro_text
                    else:
                        self.main_intro = intro_text
            # Every top-level element is still scanned for headings/lists,
            # regardless of the intro bookkeeping above.
            self._process_element(element)
        return root

    def _process_element(self, element: Any) -> None:
        """Recursively process elements to extract heading structure."""
        if element.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Convert heading tag to level (e.g., 'h2' -> 2)
            level = int(element.tag[1])
            text = self._extract_text(element)
            self.headings.append({
                'level': level if level > 1 else 2,  # Treat H1 as level 2 for sectioning
                'text': text,
                'tag': element.tag,
                'element': element,
            })
        elif element.tag in ['ul', 'ol']:
            # Extract list items
            items: List[str] = []
            for li in element:
                items.append(self._extract_text(li))
            self.headings.append({
                'type': element.tag,
                'items': items,
                'element': element,
            })
        else:
            # Not a heading or list: descend into children.
            for child in element:
                self._process_element(child)

    def _extract_text(self, element: Any) -> str:
        """Extract all text from an element and its children.

        Includes each child's tail text so inline markup (em/strong/links)
        does not drop the words that follow it.
        """
        if element.text:
            text = element.text
        else:
            text = ''
        for child in element:
            text += self._extract_text(child)
            if child.tail:
                text += child.tail
        return text.strip()
|
||||
|
||||
|
||||
class DedentPreprocessor(Preprocessor):
    """Normalize leading indentation so headings aren't treated as code blocks."""

    def run(self, lines: List[str]) -> List[str]:
        # Re-join the document, strip the common leading whitespace from
        # every line at once, and split back into lines.
        return textwrap.dedent('\n'.join(lines)).split('\n')
|
||||
|
||||
|
||||
class HeadingExtension(Extension):
    """Markdown extension wiring up dedenting and heading collection."""

    def extendMarkdown(self, md: Any) -> None:
        # Priority 27 for the preprocessor and 5 for the tree processor,
        # mirroring the registration order this pipeline relies on.
        dedenter = DedentPreprocessor(md)
        collector = HeadingCollector(md)
        md.preprocessors.register(dedenter, 'dedent_preprocessor', 27)
        md.treeprocessors.register(collector, 'heading_collector', 5)
|
||||
|
||||
|
||||
def parse_markdown_file(file_path: str) -> PageData:
    """
    Parse a Markdown file and return a structured representation.

    Args:
        file_path (str): Path to the Markdown file to parse.

    Returns:
        PageData: A nested dictionary representing the page structure.

    Raises:
        FileNotFoundError: If the file does not exist.
        IOError: If there is an error reading the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Delegate structural extraction to the component builder.
    return build_component_structure(raw_text, file_path)
|
||||
|
||||
|
||||
def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]:
    """Return Markdown filenames from the given docs directory.

    A missing directory is treated as "no documents" rather than an error.
    """
    try:
        entries = os.listdir(docs_dir)
    except FileNotFoundError:
        return []
    return [name for name in entries if name.endswith('.md')]
|
||||
|
||||
|
||||
def markdown_filename_to_html_filename(md_filename: str) -> str:
    """Convert a Markdown filename to its HTML counterpart.

    Only the trailing '.md' extension is rewritten; the previous
    ``str.replace('.md', '.html')`` also replaced mid-string occurrences,
    corrupting names like 'notes.mdx.md'.  The result is lowercased to
    match the site's URL convention.
    """
    if md_filename.endswith('.md'):
        return (md_filename[:-3] + '.html').lower()
    return md_filename.lower()
|
||||
|
||||
|
||||
def build_component_structure(
        markdown_content: str, file_path: str) -> PageData:
    """
    Build a nested component structure from Markdown content.

    Heading levels map to components: '#' sets the page title, '##' opens a
    section, '###' a card, '####' a detail.  Remaining non-blank lines are
    rendered to HTML and attached to the innermost open container.

    Args:
        markdown_content: Raw Markdown text for one page.
        file_path: Source path; its basename supplies a fallback title.

    Returns:
        PageData dict with 'title' and 'sections'.
    """
    page: PageData = {
        'title': None,
        'sections': [],
    }
    # Single mutable state dict shared with the module-level helpers.
    # (The previous version mirrored this state into local variables and
    # "rehydrated" them after the loop; those locals were dead code.)
    state: ParserState = {
        'page': page,
        'current_section': None,
        'current_card': None,
        'current_detail': None,
        'content_buffer': [],
        'detail_buffer': [],
    }

    for line in markdown_content.split('\n'):
        process_line_with_state(line, state)

    # Flush whatever is still buffered once all lines are consumed, using
    # the same helpers the per-line processing uses (previously duplicated
    # inline here).
    flush_detail_buffer(state)
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    else:
        flush_content_buffer_to_section(state)

    # Derive a human-readable title from the filename when no H1 was seen.
    if page['title'] is None:
        filename = os.path.basename(file_path)
        page['title'] = os.path.splitext(filename)[0].replace('_', ' ').title()

    return page
|
||||
|
||||
|
||||
def flush_detail_buffer(state: ParserState) -> None:
    """Render buffered detail lines into the active detail and clear the buffer."""
    detail = state['current_detail']
    buffered = state['detail_buffer']
    if detail is not None and buffered:
        detail['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
    state['detail_buffer'] = []
|
||||
|
||||
|
||||
def flush_content_buffer_to_card(state: ParserState) -> None:
    """Render buffered content into the active card, if both exist.

    Does not clear the buffer; callers reset it themselves.
    """
    card = state['current_card']
    buffered = state['content_buffer']
    if card is None or not buffered:
        return
    card['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
||||
|
||||
|
||||
def flush_content_buffer_to_section(state: ParserState) -> None:
    """Render buffered content into the active section, if both exist.

    Does not clear the buffer; callers reset it themselves.
    """
    section = state['current_section']
    buffered = state['content_buffer']
    if section is None or not buffered:
        return
    section['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
||||
|
||||
|
||||
def start_section(title: str, state: ParserState) -> None:
    """Start a new section with the given title.

    Pending buffered content is flushed into the container it belongs to
    (the open card if any, otherwise the open section) before the new
    section is created and made current.
    """
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)
    section = cast(Section, {
        'title': title,
        'content': '',
        'cards': []
    })
    sections = state['page'].get('sections', [])
    sections.append(section)
    state['page']['sections'] = sections
    state['current_section'] = section
    state['current_card'] = None
    # BUG FIX: also clear the active detail; previously a stale detail from
    # the prior card stayed current, so plain lines after this heading were
    # routed into the old card's detail instead of this section.
    state['current_detail'] = None
    state['content_buffer'] = []
|
||||
|
||||
|
||||
def start_card(title: str, state: ParserState) -> None:
    """Start a new card within the current section.

    No-op when no section is open (an H3 before any H2 is dropped).
    Pending buffered content is flushed to its owner first.
    """
    if state['current_section'] is None:
        return
    if state['current_card'] is None:
        flush_content_buffer_to_section(state)
    else:
        flush_content_buffer_to_card(state)
    card = cast(Card, {
        'title': title,
        'content': ''
    })
    state['current_section']['cards'].append(card)
    state['current_card'] = card
    # BUG FIX: reset the active detail so lines after this heading are not
    # appended to a detail belonging to the previous card.
    state['current_detail'] = None
    state['content_buffer'] = []
|
||||
|
||||
|
||||
def start_detail(title: str, state: ParserState) -> None:
    """Start a new detail within the current card.

    No-op when no card is open (the caller treats an H4 outside a card as
    plain content).  Any pending detail text is flushed first via the
    shared helper, replacing the previous inline copy of that logic.
    """
    if state['current_card'] is None:
        return
    flush_detail_buffer(state)
    detail = cast(Detail, {
        'title': title,
        'content': ''
    })
    card = state['current_card']
    # Create the details list lazily so cards without H4s stay lean.
    if 'details' not in card:
        card['details'] = []
    card['details'].append(detail)
    state['current_detail'] = detail
|
||||
|
||||
|
||||
def process_line_with_state(line: str, state: ParserState) -> None:
    """Process a single markdown line, updating the provided state dict.

    Heading prefixes drive the structure: '# ' sets the page title, '## '
    opens a section, '### ' opens a card, '#### ' opens a detail (only when
    a card is active).  Any other non-blank line is buffered for the
    innermost active container.
    """
    if line.startswith('# '):
        # H1 - page title (a later H1 would overwrite an earlier one).
        state['page']['title'] = line[2:].strip()
        # intro content before sections: flush what was collected so far,
        # then detach from the section so following text is page-level.
        if state['current_section'] is not None and state['content_buffer']:
            flush_content_buffer_to_section(state)
        state['current_section'] = None
    elif line.startswith('## '):
        # H2 - major section
        flush_detail_buffer(state)
        title = line[3:].strip()
        title = check_image_in_title(title)
        start_section(title, state)
    elif line.startswith('### '):
        # H3 - card or subsection
        flush_detail_buffer(state)
        title = line[4:].strip()
        title = check_image_in_title(title)
        start_card(title, state)
    elif line.startswith('#### '):
        # H4 - detail inside a card; outside a card it is kept as plain text.
        if state['current_card'] is not None:
            title = line[5:].strip()
            title = check_image_in_title(title)
            start_detail(title, state)
        else:
            state['content_buffer'].append(line)
    elif line.strip():
        # Plain content: route to the detail buffer when a detail is open,
        # otherwise to the generic content buffer.
        # NOTE(review): blank lines are dropped entirely, so consecutive
        # Markdown paragraphs merge into one when rendered — confirm this
        # is intended.
        if state['current_detail'] is not None:
            state['detail_buffer'].append(line)
        else:
            state['content_buffer'].append(line)
|
||||
|
||||
|
||||
def check_image_in_title(title: str) -> str:
    """Convert Markdown image syntax in a heading into HTML <img> tags.

    Every ``![alt](src)`` occurrence is replaced with its own tag.  The
    previous implementation built one tag from the *first* match and used
    it as a plain re.sub replacement string for *all* matches, so multiple
    images collapsed into one and backslashes or group references in the
    alt/src corrupted the output.  Alt and src are now attribute-escaped.

    Returns the title unchanged when it contains no image.
    """
    import html  # local import: only needed on the image path

    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'

    def _img_tag(match: Any) -> str:
        alt_text = html.escape(match.group(1).strip(), quote=True)
        src = html.escape(match.group(2).strip(), quote=True)
        return f'<img src="{src}" alt="{alt_text}"/>'

    new_title, replaced = re.subn(img_pattern, _img_tag, title)
    # Match the original contract: only strip when a substitution happened.
    return new_title.strip() if replaced else title
|
||||
|
||||
|
||||
def markdown_to_html_lines(text: str) -> str:
    """
    Convert Markdown text to HTML, sanitizing links and images.

    - Anchors with javascript:/data:/vbscript: schemes are neutralized.
    - External http(s) links open in a new tab with rel="noopener noreferrer".
    - Images with an empty src are removed; bare relative src values are
      rewritten under 'img/'; missing alt text is derived from the filename.

    Returns '' for empty input.
    """
    if not text:
        return ''

    md = markdown.Markdown()
    html = md.convert(text)

    def _is_unsafe(href: str) -> bool:
        # Schemes that can execute script or smuggle payloads.
        # NOTE(review): a literal prefix check; it does not catch
        # whitespace- or entity-obfuscated schemes.
        lower = href.strip().lower()
        return lower.startswith('javascript:') or lower.startswith(
            'data:') or lower.startswith('vbscript:')

    soup = BeautifulSoup(html, 'html.parser')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # bs4 can hand back multi-valued attributes as a list.
        if isinstance(href, (list, tuple)):
            href = href[0]
        href = (href or '').strip()
        if not href:
            continue
        if _is_unsafe(href):
            # Defang the link and drop attributes that no longer apply.
            anchor['href'] = '#unsafe'
            anchor.attrs.pop('target', None)
            anchor.attrs.pop('rel', None)
            continue
        if href.startswith('http://') or href.startswith('https://'):
            # External links open in a new tab, safely.
            anchor['target'] = '_blank'
            anchor['rel'] = 'noopener noreferrer'

    for image in soup.find_all('img'):
        src = image.get('src') or ''
        if isinstance(src, (list, tuple)):
            src = src[0]
        src = src.strip()
        if not src:
            # An <img> without a source renders nothing useful; drop it.
            image.decompose()
            continue

        alt = image.get('alt')
        if isinstance(alt, (list, tuple)):
            alt = alt[0]
        alt_text = (alt or '').strip()
        # Rewrite bare relative paths so they resolve under the site's img/.
        if not (src.startswith('http://') or src.startswith('https://')
                or src.startswith('/') or src.startswith('img/')):
            filename = os.path.basename(src)
            # BUG FIX: the f-string had no placeholder, so every relative
            # image pointed at the literal path 'img/(unknown)'.
            src = f'img/{filename}' if filename else src
            image['src'] = src

        if not alt_text:
            # Fall back to a readable alt derived from the filename.
            alt_text = os.path.splitext(os.path.basename(src))[
                0].replace('-', ' ').replace('_', ' ').strip()
        image['alt'] = alt_text

    return str(soup)
|
||||
@@ -0,0 +1,54 @@
|
||||
from typing import TypedDict, List, Dict, Optional
|
||||
|
||||
|
||||
class Detail(TypedDict):
    """A '####'-level entry nested inside a card."""
    # Heading text of the detail.
    title: str
    # Rendered HTML body for the detail.
    content: str
|
||||
|
||||
|
||||
class Card(TypedDict, total=False):
    """A '###'-level card within a section; every key optional (total=False)."""
    # Heading text of the card.
    title: str
    # Rendered HTML body for the card.
    content: str
    # '####'-level details; present only when the card has any.
    details: List[Detail]
|
||||
|
||||
|
||||
class Section(TypedDict):
    """A '##'-level major section of a page."""
    # Heading text of the section.
    title: str
    # Rendered HTML content preceding the cards.
    content: str
    # Cards ('###' headings) belonging to this section.
    cards: List[Card]
|
||||
|
||||
|
||||
class PageData(TypedDict):
    """Structured page representation produced by the markdown parser."""
    # Title from the first '#' heading, or a filename-derived fallback;
    # None only while parsing is still in progress.
    title: Optional[str]
    # Top-level sections in document order.
    sections: List[Section]
|
||||
|
||||
|
||||
class PageMeta(TypedDict, total=False):
    """Per-page HTML head metadata; every key optional (total=False)."""
    # <title> text.
    title: str
    # Meta description.
    description: str
    # Meta keywords.
    keywords: str
    # Open Graph description.
    og_description: str
    # Favicon path or URL.
    favicon: str
    # Twitter card image.
    twitter_image: str
    # Open Graph image.
    og_image: str
|
||||
|
||||
|
||||
class PageEntry(TypedDict, total=False):
    """Navigation/render entry for one page; every key optional (total=False)."""
    # Hero/page heading.
    page_title: str
    # Secondary heading under the title.
    page_subtitle: str
    # Call-to-action label.
    page_cta: str
    # Call-to-action target URL.
    page_cta_url: str
    # HTML head metadata for the page.
    meta: PageMeta
    # True when this entry is the page currently being rendered
    # (toggled by set_active_page_by_url).
    active: bool
|
||||
|
||||
|
||||
# Maps a page URL/filename key to its navigation entry.
PagesDict = Dict[str, PageEntry]
|
||||
|
||||
|
||||
class ParserState(TypedDict):
    """Mutable state threaded through the markdown line parser."""
    # Page under construction.
    page: PageData
    # Currently open section/card/detail (None when not open).
    current_section: Optional[Section]
    current_card: Optional[Card]
    current_detail: Optional[Detail]
    # Pending plain-content lines for the open section or card.
    content_buffer: List[str]
    # Pending lines for the open detail.
    detail_buffer: List[str]
|
||||
+126
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Any
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from lib.types import PagesDict
|
||||
|
||||
DEFAULT_TEMPLATE_DIR = 'templates'
|
||||
DEFAULT_OUTPUT_DIR = 'html'
|
||||
|
||||
|
||||
def get_template_files(template_dir: str = DEFAULT_TEMPLATE_DIR) -> List[str]:
    """Return template filenames from the provided template directory.

    Partials (names starting with '_') are excluded; a missing directory
    yields an empty list.
    """
    if not os.path.exists(template_dir):
        return []

    def _is_page_template(name: str) -> bool:
        return name.endswith('.html') and not name.startswith('_')

    return [name for name in os.listdir(template_dir) if _is_page_template(name)]
|
||||
|
||||
|
||||
def get_css_files(output_dir: str = DEFAULT_OUTPUT_DIR) -> List[str]:
    """Return CSS file paths under ``<output_dir>/css`` (empty if missing)."""
    css_dir = os.path.join(output_dir, 'css')
    if not os.path.exists(css_dir):
        return []
    paths: List[str] = []
    for entry in os.listdir(css_dir):
        if entry.endswith('.css'):
            paths.append(os.path.join(css_dir, entry))
    return paths
|
||||
|
||||
|
||||
def get_js_files(output_dir: str = DEFAULT_OUTPUT_DIR) -> List[str]:
    """Return JavaScript file paths under ``<output_dir>/js`` (empty if missing)."""
    js_dir = os.path.join(output_dir, 'js')
    if not os.path.exists(js_dir):
        return []
    entries = os.listdir(js_dir)
    return [os.path.join(js_dir, entry)
            for entry in entries
            if entry.endswith('.js')]
|
||||
|
||||
|
||||
def render_template(
        template_name: str,
        context: Dict[str, Any],
        template_dir: str = DEFAULT_TEMPLATE_DIR) -> str:
    """Render a Jinja2 template with the provided context.

    NOTE(review): the Environment is built per call and without
    autoescape — fine for trusted, self-authored templates; confirm no
    user-supplied values are rendered.
    """
    loader = FileSystemLoader(template_dir)
    environment = Environment(loader=loader)
    template = environment.get_template(template_name)
    return template.render(context)
|
||||
|
||||
|
||||
def set_active_page_by_url(pages: PagesDict, page_url: str) -> None:
    """Mark exactly the entry whose key equals *page_url* as active.

    Mutates every PageEntry in place: the matching key gets
    ``active=True``, all others ``active=False``.
    """
    for url, entry in pages.items():
        entry['active'] = (url == page_url)
|
||||
|
||||
|
||||
def minify_js(js: str) -> str:
    """Minify JavaScript by stripping comments and collapsing whitespace.

    NOTE(review): regex-based, so ``//`` or ``/* */`` sequences inside
    string literals (e.g. URLs like ``http://``) will be mangled — fine
    for this site's simple scripts, not a general JS minifier.
    """
    # Strip // line comments — including one at EOF with no trailing
    # newline, which the previous '//.*?\n' pattern missed — and
    # /* ... */ block comments.
    js = re.sub(r'//[^\n]*|/\*.*?\*/', '', js, flags=re.DOTALL)
    # Collapse all whitespace runs to single spaces.  (The old
    # ';'->';\n' pass followed by '\n+'->' ' was a no-op round-trip
    # after this collapse and has been removed.)
    js = re.sub(r'\s+', ' ', js)
    return js.strip()
|
||||
|
||||
|
||||
def minify_css(css: str) -> str:
    """Minify CSS by removing comments and redundant whitespace."""
    # Ordered substitution pipeline; each pair is (pattern, replacement).
    passes = (
        (r'/\*.*?\*/', ''),       # strip block comments
        (r'\s+', ' '),            # collapse whitespace runs
        (r';\s+', ';\n'),         # temporary newline after ';'
        (r'\s+([\{\s])', r'\1'),  # drop space before '{' (or whitespace)
        (r'\s+}', '}'),           # drop space before '}'
        (r'\n+', ' '),            # fold the temporary newlines back
    )
    for pattern, replacement in passes:
        css = re.sub(pattern, replacement, css, flags=re.DOTALL)
    return css.strip()
|
||||
|
||||
|
||||
def minify_html(html: str) -> str:
    """Minify HTML by removing comments and redundant whitespace.

    NOTE(review): whitespace inside <pre>/<textarea> is collapsed too —
    acceptable for this site's templates, not a general HTML minifier.
    """
    # Ordered substitution pipeline; each pair is (pattern, replacement).
    passes = (
        (r'<!--.*?-->', ''),  # strip HTML comments
        (r'\s+', ' '),        # collapse whitespace runs
        (r'>\s+<', '><'),     # drop whitespace between adjacent tags
        (r'<\s+', '<'),       # drop space after '<'
        (r'\s+</', '</'),     # drop space before a closing tag
    )
    for pattern, replacement in passes:
        html = re.sub(pattern, replacement, html, flags=re.DOTALL)
    return html.strip()
|
||||
|
||||
|
||||
def css_minifier(output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Minify every CSS file under ``<output_dir>/css`` in place."""
    for path in get_css_files(output_dir):
        with open(path, 'r', encoding='utf-8') as source:
            original = source.read()
        compact = minify_css(original)
        with open(path, 'w', encoding='utf-8') as target:
            target.write(compact)
|
||||
|
||||
|
||||
def js_minifier(output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Minify every JavaScript file under ``<output_dir>/js`` in place."""
    for path in get_js_files(output_dir):
        with open(path, 'r', encoding='utf-8') as source:
            original = source.read()
        compact = minify_js(original)
        with open(path, 'w', encoding='utf-8') as target:
            target.write(compact)
|
||||
|
||||
|
||||
# Explicit public API of this module (for `import *` consumers and linters).
__all__ = [
    'DEFAULT_OUTPUT_DIR',
    'DEFAULT_TEMPLATE_DIR',
    'get_template_files',
    'get_css_files',
    'get_js_files',
    'render_template',
    'set_active_page_by_url',
    'minify_js',
    'minify_css',
    'minify_html',
    'css_minifier',
    'js_minifier',
]
|
||||
Reference in New Issue
Block a user