# Source-control provenance (commit 9f0a216c5e):
# - Created index.html template for the homepage with service cards and partner logos.
# - Added page_from_md.html template for rendering pages from markdown.
# - Developed services.html template detailing various services offered.
# - Implemented tests for link handling in markdown, ensuring external links
#   open in new tabs and internal links function correctly.
# - Enhanced markdown parser tests to validate heading extraction, content
#   rendering, and link safety.
# - Introduced utility tests for template rendering, HTML minification, and
#   JavaScript minification.
# Co-authored-by: Copilot <copilot@github.com>
"""
|
|
Markdown parser for converting Markdown files into structured component data.
|
|
|
|
This module reads Markdown files and returns a structured representation that maps
|
|
heading levels to component types:
|
|
- H1 (#) -> page title / hero
|
|
- H2 (##) -> major sections
|
|
- H3 (###) -> cards or subsections within sections
|
|
- Lists -> converted to component-compatible format
|
|
"""
|
|
|
|
import html
import os
import re
import textwrap
from typing import Any, Dict, List, Optional, cast

import markdown
from bs4 import BeautifulSoup
from markdown.extensions import Extension
from markdown.preprocessors import Preprocessor
from markdown.treeprocessors import Treeprocessor

from lib.types import Card, Detail, PageData, ParserState, Section
|
|
|
|
|
|
class HeadingCollector(Treeprocessor):
    """
    Markdown tree processor that records headings, lists, and the intro text
    appearing between the H1 and the first subsequent heading.

    After ``run`` executes:
      - ``self.headings`` holds one dict per heading/list, in document order.
      - ``self.main_intro`` holds paragraph text collected after the H1.
    """

    def __init__(self, md: Any) -> None:
        super().__init__(md)
        # Ordered record of headings and lists discovered during `run`.
        self.headings: List[Dict[str, Any]] = []
        self.current_content: List[str] = []
        self.main_intro: str = ''

    def run(self, root: Any) -> Any:
        """Process the element tree and collect headings with content."""
        self.headings = []
        self.main_intro = ''
        collecting_intro = False
        for element in root:
            if element.tag == 'h1':
                # Everything after the H1 (until another heading) is intro.
                collecting_intro = True
            elif collecting_intro and element.tag in ('h2', 'h3', 'h4', 'h5', 'h6'):
                collecting_intro = False
            elif collecting_intro:
                intro_text = self._extract_text(element)
                if intro_text:
                    if self.main_intro:
                        self.main_intro += '\n\n' + intro_text
                    else:
                        self.main_intro = intro_text
            self._process_element(element)
        return root

    def _process_element(self, element: Any) -> None:
        """Recursively process elements to extract heading structure."""
        if element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # Convert heading tag to level (e.g., 'h2' -> 2).
            level = int(element.tag[1])
            self.headings.append({
                'level': level if level > 1 else 2,  # Treat H1 as level 2 for sectioning
                'text': self._extract_text(element),
                'tag': element.tag,
                'element': element,
            })
        elif element.tag in ('ul', 'ol'):
            # Record list items as plain strings.
            self.headings.append({
                'type': element.tag,
                'items': [self._extract_text(li) for li in element],
                'element': element,
            })
        else:
            # Not structural itself; descend looking for nested headings/lists.
            for child in element:
                self._process_element(child)

    def _extract_text(self, element: Any) -> str:
        """Extract all text from an element and its children.

        Bug fix: the previous version returned only ``element.text`` when it
        was non-empty, silently dropping text inside child elements (e.g. the
        emphasised part of ``Title with **bold**``). Child text and tails are
        now always appended.
        """
        text = element.text or ''
        for child in element:
            text += self._extract_text(child)
            if child.tail:
                text += child.tail
        return text.strip()
|
|
|
|
|
|
class DedentPreprocessor(Preprocessor):
    """Normalize leading indentation so headings aren't treated as code blocks."""

    def run(self, lines: List[str]) -> List[str]:
        # Join, strip the common leading whitespace, and re-split so the
        # markdown parser never sees accidental 4-space "code block" indents.
        return textwrap.dedent('\n'.join(lines)).split('\n')
|
|
|
|
|
|
class HeadingExtension(Extension):
    """Markdown extension wiring up the dedent and heading-collection passes."""

    def extendMarkdown(self, md: Any) -> None:
        # Dedent runs early (priority 27), before block parsing; the
        # collector runs late (priority 5) over the finished element tree.
        dedenter = DedentPreprocessor(md)
        collector = HeadingCollector(md)
        md.preprocessors.register(dedenter, 'dedent_preprocessor', 27)
        md.treeprocessors.register(collector, 'heading_collector', 5)
|
|
|
|
|
|
def parse_markdown_file(file_path: str) -> PageData:
    """
    Parse a Markdown file and return a structured representation.

    Args:
        file_path: Path to the Markdown file to parse.

    Returns:
        PageData: A nested dictionary representing the page structure.

    Raises:
        FileNotFoundError: If the file does not exist.
        IOError: If there is an error reading the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_markdown = handle.read()

    # Delegate structural parsing to the shared builder.
    return build_component_structure(raw_markdown, file_path)
|
|
|
|
|
|
def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]:
    """Return Markdown filenames from the given docs directory."""
    # EAFP: a missing directory simply yields no files.
    try:
        entries = os.listdir(docs_dir)
    except FileNotFoundError:
        return []
    return [name for name in entries if name.endswith('.md')]
|
|
|
|
|
|
def markdown_filename_to_html_filename(md_filename: str) -> str:
    """Convert a Markdown filename to its lower-cased HTML counterpart.

    Bug fix: ``str.replace`` previously rewrote *every* ``.md`` occurrence,
    so ``notes.md.md`` became ``notes.html.html``. Only the final ``.md``
    extension is rewritten now.
    """
    base, ext = os.path.splitext(md_filename)
    if ext == '.md':
        md_filename = base + '.html'
    return md_filename.lower()
|
|
|
|
|
|
def build_component_structure(
        markdown_content: str, file_path: str) -> PageData:
    """
    Build a nested component structure from Markdown content.

    Headings map to a hierarchy (H2 -> sections, H3 -> cards, H4 -> details)
    suitable for rendering with component templates.

    Improvement: the end-of-input flush now reuses the shared
    ``flush_content_buffer_to_card`` / ``flush_content_buffer_to_section``
    helpers instead of duplicating their logic inline, and the redundant
    local-variable "rehydration" step has been dropped.

    Args:
        markdown_content: Raw markdown text.
        file_path: Source path; its basename is the title fallback.

    Returns:
        PageData dict with 'title' and 'sections'.
    """
    page: PageData = {
        'title': None,
        'sections': [],
    }
    # All parser state lives in one dict so module-level helpers can mutate it.
    state: ParserState = {
        'page': page,
        'current_section': None,
        'current_card': None,
        'current_detail': None,
        'content_buffer': [],
        'detail_buffer': [],
    }

    for line in markdown_content.split('\n'):
        process_line_with_state(line, state)

    # Flush whatever is still buffered at end of input.
    flush_detail_buffer(state)
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)

    # Fall back to a title derived from the file name.
    if page['title'] is None:
        filename = os.path.basename(file_path)
        page['title'] = os.path.splitext(filename)[0].replace('_', ' ').title()

    return page
|
|
|
|
|
|
def flush_detail_buffer(state: ParserState) -> None:
    """Render buffered detail lines into the current detail entry, if any."""
    detail = state['current_detail']
    buffered = state['detail_buffer']
    if detail is None or not buffered:
        return
    detail['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
    state['detail_buffer'] = []
|
|
|
|
|
|
def flush_content_buffer_to_card(state: ParserState) -> None:
    """Render buffered content lines into the current card, if any."""
    card = state['current_card']
    buffered = state['content_buffer']
    if card is None or not buffered:
        return
    card['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
|
|
|
|
|
def flush_content_buffer_to_section(state: ParserState) -> None:
    """Render buffered content lines into the current section, if any."""
    section = state['current_section']
    buffered = state['content_buffer']
    if section is None or not buffered:
        return
    section['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
|
|
|
|
|
def start_section(title: str, state: ParserState) -> None:
    """Start a new section with the given title.

    Pending content is flushed into the open card (if one exists, which also
    empties the buffer so the old section cannot double-consume it) or into
    the open section, before the new section becomes current.

    Improvement: the previous version reset ``content_buffer`` and
    ``current_card`` twice; the duplicate dead statements are removed.
    """
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)
    section = cast(Section, {
        'title': title,
        'content': '',
        'cards': []
    })
    sections = state['page'].get('sections', [])
    sections.append(section)
    state['page']['sections'] = sections
    # The new section becomes current; card context and buffer start fresh.
    state['current_section'] = section
    state['current_card'] = None
    state['content_buffer'] = []
|
|
|
|
|
|
def start_card(title: str, state: ParserState) -> None:
    """Start a new card within the current section (no-op without a section)."""
    section = state['current_section']
    if section is None:
        # Cards can only live inside a section; ignore stray H3s.
        return
    # Flush pending content to wherever it belongs before switching cards.
    if state['current_card'] is None:
        flush_content_buffer_to_section(state)
    else:
        flush_content_buffer_to_card(state)
    card = cast(Card, {'title': title, 'content': ''})
    section['cards'].append(card)
    state['current_card'] = card
    state['content_buffer'] = []
|
|
|
|
|
|
def start_detail(title: str, state: ParserState) -> None:
    """Start a new detail within the current card (no-op without a card).

    Consistency fix: the pending detail buffer is flushed via
    ``flush_detail_buffer`` instead of a duplicated inline copy of its logic.
    """
    card = state['current_card']
    if card is None:
        # Details can only live inside a card; ignore stray H4s.
        return
    flush_detail_buffer(state)
    detail = cast(Detail, {
        'title': title,
        'content': ''
    })
    # 'details' is created lazily so cards without details stay lightweight.
    if 'details' not in card:
        card['details'] = []
    card['details'].append(detail)
    state['current_detail'] = detail
|
|
|
|
|
|
def process_line_with_state(line: str, state: ParserState) -> None:
    """Route one markdown line into the page/section/card/detail state.

    Heading prefixes drive structure: '# ' sets the page title, '## ' opens
    a section, '### ' opens a card, '#### ' opens a detail inside a card.
    Anything else accumulates in the active content/detail buffer.
    NOTE(review): blank lines are discarded, so consecutive paragraphs merge
    when a buffer is rendered — confirm this is intended.
    """
    if line.startswith('# '):
        # H1 - page title; close out any intro content gathered so far.
        state['page']['title'] = line[2:].strip()
        if state['current_section'] is not None and state['content_buffer']:
            flush_content_buffer_to_section(state)
        state['current_section'] = None
    elif line.startswith('## '):
        # H2 - major section.
        flush_detail_buffer(state)
        start_section(check_image_in_title(line[3:].strip()), state)
    elif line.startswith('### '):
        # H3 - card within the current section.
        flush_detail_buffer(state)
        start_card(check_image_in_title(line[4:].strip()), state)
    elif line.startswith('#### '):
        # H4 - detail inside a card; outside a card it is plain content.
        if state['current_card'] is not None:
            start_detail(check_image_in_title(line[5:].strip()), state)
        else:
            state['content_buffer'].append(line)
    elif line.strip():
        # Ordinary content goes to whichever buffer is active.
        target = 'detail_buffer' if state['current_detail'] is not None else 'content_buffer'
        state[target].append(line)
|
|
|
|
|
|
def check_image_in_title(title: str) -> str:
    """Replace a Markdown image in *title* with an HTML ``<img>`` tag.

    Surrounding title text is preserved. Fixes:
      - alt text and src are HTML-escaped (``html.escape``) so quotes or
        angle brackets in the Markdown cannot break out of the attributes;
      - the replacement is supplied via a callable so backslashes in the
        tag are not misread as regex group references by ``re.sub``.

    NOTE(review): as before, every image occurrence is replaced with the tag
    built from the *first* match — confirm multi-image titles never occur.
    """
    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    match = re.search(img_pattern, title)
    if match:
        alt_text = html.escape(match.group(1).strip(), quote=True)
        src = html.escape(match.group(2).strip(), quote=True)
        img_tag = f'<img src="{src}" alt="{alt_text}"/>'
        title = re.sub(img_pattern, lambda _: img_tag, title).strip()
    return title
|
|
|
|
|
|
def markdown_to_html_lines(text: str) -> str:
    """
    Convert Markdown text to sanitized HTML.

    Post-processing applied to the rendered HTML:
      - anchors with javascript:/data:/vbscript: hrefs are neutralized,
      - external (http/https) links open in a new tab with
        ``rel="noopener noreferrer"``,
      - images with an empty src are removed; bare relative srcs are
        rewritten under ``img/`` and missing alt text is derived from the
        filename.

    Bug fix: the relative-src rewrite previously emitted the literal string
    ``img/(unknown)`` (a corrupted f-string) instead of ``img/<filename>``.

    Args:
        text: Raw markdown text; an empty string short-circuits to ''.

    Returns:
        The sanitized HTML string.
    """
    if not text:
        return ''

    md = markdown.Markdown()
    rendered = md.convert(text)

    def _is_unsafe(href: str) -> bool:
        # Schemes that can execute script or smuggle arbitrary payloads.
        return href.strip().lower().startswith(
            ('javascript:', 'data:', 'vbscript:'))

    soup = BeautifulSoup(rendered, 'html.parser')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if isinstance(href, (list, tuple)):
            # bs4 can return multi-valued attributes; take the first.
            href = href[0]
        href = (href or '').strip()
        if not href:
            continue
        if _is_unsafe(href):
            # Point the link somewhere inert and drop link-behavior attrs.
            anchor['href'] = '#unsafe'
            anchor.attrs.pop('target', None)
            anchor.attrs.pop('rel', None)
            continue
        if href.startswith(('http://', 'https://')):
            # External links open in a new tab, without window.opener access.
            anchor['target'] = '_blank'
            anchor['rel'] = 'noopener noreferrer'

    for image in soup.find_all('img'):
        src = image.get('src') or ''
        if isinstance(src, (list, tuple)):
            src = src[0]
        src = src.strip()
        if not src:
            # An <img> without a source renders nothing useful; drop it.
            image.decompose()
            continue

        alt = image.get('alt')
        if isinstance(alt, (list, tuple)):
            alt = alt[0]
        alt_text = (alt or '').strip()

        # Rewrite bare relative paths into the site's img/ directory.
        if not src.startswith(('http://', 'https://', '/', 'img/')):
            filename = os.path.basename(src)
            src = f'img/{filename}' if filename else src
            image['src'] = src

        if not alt_text:
            # Derive readable alt text from the file name stem.
            stem = os.path.splitext(os.path.basename(src))[0]
            alt_text = stem.replace('-', ' ').replace('_', ' ').strip()
            image['alt'] = alt_text

    return str(soup)
|