Add new templates and tests for improved functionality
- Created index.html template for the homepage with service cards and partner logos.
- Added page_from_md.html template for rendering pages from markdown.
- Developed services.html template detailing various services offered.
- Implemented tests for link handling in markdown, ensuring external links open in new tabs and internal links function correctly.
- Enhanced markdown parser tests to validate heading extraction, content rendering, and link safety.
- Introduced utility tests for template rendering, HTML minification, and JavaScript minification.

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
Markdown parser for converting Markdown files into structured component data.
|
||||
|
||||
This module reads Markdown files and returns a structured representation that maps
|
||||
heading levels to component types:
|
||||
- H1 (#) -> page title / hero
|
||||
- H2 (##) -> major sections
|
||||
- H3 (###) -> cards or subsections within sections
|
||||
- Lists -> converted to component-compatible format
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import textwrap
|
||||
import markdown
|
||||
from bs4 import BeautifulSoup
|
||||
from markdown.treeprocessors import Treeprocessor
|
||||
from markdown.preprocessors import Preprocessor
|
||||
from markdown.extensions import Extension
|
||||
from typing import Dict, List, Any, Optional, cast
|
||||
from lib.types import PageData, Section, Card, Detail, ParserState
|
||||
|
||||
|
||||
class HeadingCollector(Treeprocessor):
    """
    Custom Markdown tree processor that collects headings and their content.

    After ``run`` executes, ``self.headings`` holds one record per heading or
    list encountered (in document order), and ``self.main_intro`` holds the
    paragraph text found between the first H1 and the next heading.
    """

    def __init__(self, md: Any) -> None:
        super().__init__(md)
        # Collected heading/list records, in document order.
        self.headings: List[Dict[str, Any]] = []
        # NOTE(review): never read or written again in this module —
        # candidate for removal once confirmed no external code uses it.
        self.current_content: List[str] = []
        # Intro paragraphs between the H1 and the first subsequent heading.
        self.main_intro: str = ''

    def run(self, root: Any) -> Any:
        """Process the element tree and collect headings with content."""
        self.headings = []
        self.main_intro = ''
        collecting_intro = False
        for element in root:
            if element.tag == 'h1':
                # Start capturing intro text after the page title.
                collecting_intro = True
            elif collecting_intro and element.tag in ['h2', 'h3', 'h4', 'h5', 'h6']:
                # The first structural heading ends the intro region.
                collecting_intro = False
            elif collecting_intro:
                intro_text = self._extract_text(element)
                if intro_text:
                    # Join multiple intro paragraphs with a blank line.
                    if self.main_intro:
                        self.main_intro += '\n\n' + intro_text
                    else:
                        self.main_intro = intro_text
            # Every top-level element is still scanned for headings/lists,
            # regardless of the intro bookkeeping above.
            self._process_element(element)
        return root

    def _process_element(self, element: Any) -> None:
        """Recursively process elements to extract heading structure."""
        if element.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Convert heading tag to level (e.g., 'h2' -> 2)
            level = int(element.tag[1])
            text = self._extract_text(element)
            self.headings.append({
                'level': level if level > 1 else 2,  # Treat H1 as level 2 for sectioning
                'text': text,
                'tag': element.tag,
                'element': element,
            })
        elif element.tag in ['ul', 'ol']:
            # Extract list items
            items: List[str] = []
            for li in element:
                items.append(self._extract_text(li))
            self.headings.append({
                'type': element.tag,
                'items': items,
                'element': element,
            })
        else:
            # Not a heading or list: descend into children.
            for child in element:
                self._process_element(child)

    def _extract_text(self, element: Any) -> str:
        """Extract all text from an element and its children.

        Includes each child's tail text so inline markup (em/strong/links)
        does not drop the words that follow it.
        """
        if element.text:
            text = element.text
        else:
            text = ''
        for child in element:
            text += self._extract_text(child)
            if child.tail:
                text += child.tail
        return text.strip()
|
||||
|
||||
|
||||
class DedentPreprocessor(Preprocessor):
    """Normalize leading indentation so headings aren't treated as code blocks."""

    def run(self, lines: List[str]) -> List[str]:
        # Re-join the document, strip the common leading whitespace from
        # every line at once, and split back into lines.
        return textwrap.dedent('\n'.join(lines)).split('\n')
|
||||
|
||||
|
||||
class HeadingExtension(Extension):
    """Markdown extension wiring up dedenting and heading collection."""

    def extendMarkdown(self, md: Any) -> None:
        # Priority 27 for the preprocessor and 5 for the tree processor,
        # mirroring the registration order this pipeline relies on.
        dedenter = DedentPreprocessor(md)
        collector = HeadingCollector(md)
        md.preprocessors.register(dedenter, 'dedent_preprocessor', 27)
        md.treeprocessors.register(collector, 'heading_collector', 5)
|
||||
|
||||
|
||||
def parse_markdown_file(file_path: str) -> PageData:
    """
    Parse a Markdown file and return a structured representation.

    Args:
        file_path (str): Path to the Markdown file to parse.

    Returns:
        PageData: A nested dictionary representing the page structure.

    Raises:
        FileNotFoundError: If the file does not exist.
        IOError: If there is an error reading the file.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Delegate structural extraction to the component builder.
    return build_component_structure(raw_text, file_path)
|
||||
|
||||
|
||||
def get_markdown_files(docs_dir: str = 'docs/en') -> List[str]:
    """Return Markdown filenames from the given docs directory.

    A missing directory is treated as "no documents" rather than an error.
    """
    try:
        entries = os.listdir(docs_dir)
    except FileNotFoundError:
        return []
    return [name for name in entries if name.endswith('.md')]
|
||||
|
||||
|
||||
def markdown_filename_to_html_filename(md_filename: str) -> str:
    """Convert a Markdown filename to its HTML counterpart.

    Only the trailing '.md' extension is rewritten; the previous
    ``str.replace('.md', '.html')`` also replaced mid-string occurrences,
    corrupting names like 'notes.mdx.md'.  The result is lowercased to
    match the site's URL convention.
    """
    if md_filename.endswith('.md'):
        return (md_filename[:-3] + '.html').lower()
    return md_filename.lower()
|
||||
|
||||
|
||||
def build_component_structure(
        markdown_content: str, file_path: str) -> PageData:
    """
    Build a nested component structure from Markdown content.

    Heading levels map to components: '#' sets the page title, '##' opens a
    section, '###' a card, '####' a detail.  Remaining non-blank lines are
    rendered to HTML and attached to the innermost open container.

    Args:
        markdown_content: Raw Markdown text for one page.
        file_path: Source path; its basename supplies a fallback title.

    Returns:
        PageData dict with 'title' and 'sections'.
    """
    page: PageData = {
        'title': None,
        'sections': [],
    }
    # Single mutable state dict shared with the module-level helpers.
    # (The previous version mirrored this state into local variables and
    # "rehydrated" them after the loop; those locals were dead code.)
    state: ParserState = {
        'page': page,
        'current_section': None,
        'current_card': None,
        'current_detail': None,
        'content_buffer': [],
        'detail_buffer': [],
    }

    for line in markdown_content.split('\n'):
        process_line_with_state(line, state)

    # Flush whatever is still buffered once all lines are consumed, using
    # the same helpers the per-line processing uses (previously duplicated
    # inline here).
    flush_detail_buffer(state)
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    else:
        flush_content_buffer_to_section(state)

    # Derive a human-readable title from the filename when no H1 was seen.
    if page['title'] is None:
        filename = os.path.basename(file_path)
        page['title'] = os.path.splitext(filename)[0].replace('_', ' ').title()

    return page
|
||||
|
||||
|
||||
def flush_detail_buffer(state: ParserState) -> None:
    """Render buffered detail lines into the active detail and clear the buffer."""
    detail = state['current_detail']
    buffered = state['detail_buffer']
    if detail is not None and buffered:
        detail['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
    state['detail_buffer'] = []
|
||||
|
||||
|
||||
def flush_content_buffer_to_card(state: ParserState) -> None:
    """Render buffered content into the active card, if both exist.

    Does not clear the buffer; callers reset it themselves.
    """
    card = state['current_card']
    buffered = state['content_buffer']
    if card is None or not buffered:
        return
    card['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
||||
|
||||
|
||||
def flush_content_buffer_to_section(state: ParserState) -> None:
    """Render buffered content into the active section, if both exist.

    Does not clear the buffer; callers reset it themselves.
    """
    section = state['current_section']
    buffered = state['content_buffer']
    if section is None or not buffered:
        return
    section['content'] = markdown_to_html_lines('\n'.join(buffered).strip())
|
||||
|
||||
|
||||
def start_section(title: str, state: ParserState) -> None:
    """Start a new section with the given title.

    Pending buffered content is flushed into the container it belongs to
    (the open card if any, otherwise the open section) before the new
    section is created and made current.
    """
    if state['current_card'] is not None:
        flush_content_buffer_to_card(state)
    elif state['current_section'] is not None:
        flush_content_buffer_to_section(state)
    section = cast(Section, {
        'title': title,
        'content': '',
        'cards': []
    })
    sections = state['page'].get('sections', [])
    sections.append(section)
    state['page']['sections'] = sections
    state['current_section'] = section
    state['current_card'] = None
    # BUG FIX: also clear the active detail; previously a stale detail from
    # the prior card stayed current, so plain lines after this heading were
    # routed into the old card's detail instead of this section.
    state['current_detail'] = None
    state['content_buffer'] = []
|
||||
|
||||
|
||||
def start_card(title: str, state: ParserState) -> None:
    """Start a new card within the current section.

    No-op when no section is open (an H3 before any H2 is dropped).
    Pending buffered content is flushed to its owner first.
    """
    if state['current_section'] is None:
        return
    if state['current_card'] is None:
        flush_content_buffer_to_section(state)
    else:
        flush_content_buffer_to_card(state)
    card = cast(Card, {
        'title': title,
        'content': ''
    })
    state['current_section']['cards'].append(card)
    state['current_card'] = card
    # BUG FIX: reset the active detail so lines after this heading are not
    # appended to a detail belonging to the previous card.
    state['current_detail'] = None
    state['content_buffer'] = []
|
||||
|
||||
|
||||
def start_detail(title: str, state: ParserState) -> None:
    """Start a new detail within the current card.

    No-op when no card is open (the caller treats an H4 outside a card as
    plain content).  Any pending detail text is flushed first via the
    shared helper, replacing the previous inline copy of that logic.
    """
    if state['current_card'] is None:
        return
    flush_detail_buffer(state)
    detail = cast(Detail, {
        'title': title,
        'content': ''
    })
    card = state['current_card']
    # Create the details list lazily so cards without H4s stay lean.
    if 'details' not in card:
        card['details'] = []
    card['details'].append(detail)
    state['current_detail'] = detail
|
||||
|
||||
|
||||
def process_line_with_state(line: str, state: ParserState) -> None:
    """Process a single markdown line, updating the provided state dict.

    Heading prefixes drive the structure: '# ' sets the page title, '## '
    opens a section, '### ' opens a card, '#### ' opens a detail (only when
    a card is active).  Any other non-blank line is buffered for the
    innermost active container.
    """
    if line.startswith('# '):
        # H1 - page title (a later H1 would overwrite an earlier one).
        state['page']['title'] = line[2:].strip()
        # intro content before sections: flush what was collected so far,
        # then detach from the section so following text is page-level.
        if state['current_section'] is not None and state['content_buffer']:
            flush_content_buffer_to_section(state)
        state['current_section'] = None
    elif line.startswith('## '):
        # H2 - major section
        flush_detail_buffer(state)
        title = line[3:].strip()
        title = check_image_in_title(title)
        start_section(title, state)
    elif line.startswith('### '):
        # H3 - card or subsection
        flush_detail_buffer(state)
        title = line[4:].strip()
        title = check_image_in_title(title)
        start_card(title, state)
    elif line.startswith('#### '):
        # H4 - detail inside a card; outside a card it is kept as plain text.
        if state['current_card'] is not None:
            title = line[5:].strip()
            title = check_image_in_title(title)
            start_detail(title, state)
        else:
            state['content_buffer'].append(line)
    elif line.strip():
        # Plain content: route to the detail buffer when a detail is open,
        # otherwise to the generic content buffer.
        # NOTE(review): blank lines are dropped entirely, so consecutive
        # Markdown paragraphs merge into one when rendered — confirm this
        # is intended.
        if state['current_detail'] is not None:
            state['detail_buffer'].append(line)
        else:
            state['content_buffer'].append(line)
|
||||
|
||||
|
||||
def check_image_in_title(title: str) -> str:
    """Convert Markdown image syntax in a heading into HTML <img> tags.

    Every ``![alt](src)`` occurrence is replaced with its own tag.  The
    previous implementation built one tag from the *first* match and used
    it as a plain re.sub replacement string for *all* matches, so multiple
    images collapsed into one and backslashes or group references in the
    alt/src corrupted the output.  Alt and src are now attribute-escaped.

    Returns the title unchanged when it contains no image.
    """
    import html  # local import: only needed on the image path

    img_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'

    def _img_tag(match: Any) -> str:
        alt_text = html.escape(match.group(1).strip(), quote=True)
        src = html.escape(match.group(2).strip(), quote=True)
        return f'<img src="{src}" alt="{alt_text}"/>'

    new_title, replaced = re.subn(img_pattern, _img_tag, title)
    # Match the original contract: only strip when a substitution happened.
    return new_title.strip() if replaced else title
|
||||
|
||||
|
||||
def markdown_to_html_lines(text: str) -> str:
    """
    Convert Markdown text to HTML, sanitizing links and images.

    - Anchors with javascript:/data:/vbscript: schemes are neutralized.
    - External http(s) links open in a new tab with rel="noopener noreferrer".
    - Images with an empty src are removed; bare relative src values are
      rewritten under 'img/'; missing alt text is derived from the filename.

    Returns '' for empty input.
    """
    if not text:
        return ''

    md = markdown.Markdown()
    html = md.convert(text)

    def _is_unsafe(href: str) -> bool:
        # Schemes that can execute script or smuggle payloads.
        # NOTE(review): a literal prefix check; it does not catch
        # whitespace- or entity-obfuscated schemes.
        lower = href.strip().lower()
        return lower.startswith('javascript:') or lower.startswith(
            'data:') or lower.startswith('vbscript:')

    soup = BeautifulSoup(html, 'html.parser')

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # bs4 can hand back multi-valued attributes as a list.
        if isinstance(href, (list, tuple)):
            href = href[0]
        href = (href or '').strip()
        if not href:
            continue
        if _is_unsafe(href):
            # Defang the link and drop attributes that no longer apply.
            anchor['href'] = '#unsafe'
            anchor.attrs.pop('target', None)
            anchor.attrs.pop('rel', None)
            continue
        if href.startswith('http://') or href.startswith('https://'):
            # External links open in a new tab, safely.
            anchor['target'] = '_blank'
            anchor['rel'] = 'noopener noreferrer'

    for image in soup.find_all('img'):
        src = image.get('src') or ''
        if isinstance(src, (list, tuple)):
            src = src[0]
        src = src.strip()
        if not src:
            # An <img> without a source renders nothing useful; drop it.
            image.decompose()
            continue

        alt = image.get('alt')
        if isinstance(alt, (list, tuple)):
            alt = alt[0]
        alt_text = (alt or '').strip()
        # Rewrite bare relative paths so they resolve under the site's img/.
        if not (src.startswith('http://') or src.startswith('https://')
                or src.startswith('/') or src.startswith('img/')):
            filename = os.path.basename(src)
            # BUG FIX: the f-string had no placeholder, so every relative
            # image pointed at the literal path 'img/(unknown)'.
            src = f'img/{filename}' if filename else src
            image['src'] = src

        if not alt_text:
            # Fall back to a readable alt derived from the filename.
            alt_text = os.path.splitext(os.path.basename(src))[
                0].replace('-', ' ').replace('_', ' ').strip()
        image['alt'] = alt_text

    return str(soup)
|
||||
@@ -0,0 +1,54 @@
|
||||
from typing import TypedDict, List, Dict, Optional
|
||||
|
||||
|
||||
class Detail(TypedDict):
    """A '####'-level entry nested inside a card."""
    # Heading text of the detail.
    title: str
    # Rendered HTML body for the detail.
    content: str
|
||||
|
||||
|
||||
class Card(TypedDict, total=False):
    """A '###'-level card within a section; every key optional (total=False)."""
    # Heading text of the card.
    title: str
    # Rendered HTML body for the card.
    content: str
    # '####'-level details; present only when the card has any.
    details: List[Detail]
|
||||
|
||||
|
||||
class Section(TypedDict):
    """A '##'-level major section of a page."""
    # Heading text of the section.
    title: str
    # Rendered HTML content preceding the cards.
    content: str
    # Cards ('###' headings) belonging to this section.
    cards: List[Card]
|
||||
|
||||
|
||||
class PageData(TypedDict):
    """Structured page representation produced by the markdown parser."""
    # Title from the first '#' heading, or a filename-derived fallback;
    # None only while parsing is still in progress.
    title: Optional[str]
    # Top-level sections in document order.
    sections: List[Section]
|
||||
|
||||
|
||||
class PageMeta(TypedDict, total=False):
    """Per-page HTML head metadata; every key optional (total=False)."""
    # <title> text.
    title: str
    # Meta description.
    description: str
    # Meta keywords.
    keywords: str
    # Open Graph description.
    og_description: str
    # Favicon path or URL.
    favicon: str
    # Twitter card image.
    twitter_image: str
    # Open Graph image.
    og_image: str
|
||||
|
||||
|
||||
class PageEntry(TypedDict, total=False):
    """Navigation/render entry for one page; every key optional (total=False)."""
    # Hero/page heading.
    page_title: str
    # Secondary heading under the title.
    page_subtitle: str
    # Call-to-action label.
    page_cta: str
    # Call-to-action target URL.
    page_cta_url: str
    # HTML head metadata for the page.
    meta: PageMeta
    # True when this entry is the page currently being rendered
    # (toggled by set_active_page_by_url).
    active: bool
|
||||
|
||||
|
||||
# Maps a page URL/filename key to its navigation entry.
PagesDict = Dict[str, PageEntry]
|
||||
|
||||
|
||||
class ParserState(TypedDict):
    """Mutable state threaded through the markdown line parser."""
    # Page under construction.
    page: PageData
    # Currently open section/card/detail (None when not open).
    current_section: Optional[Section]
    current_card: Optional[Card]
    current_detail: Optional[Detail]
    # Pending plain-content lines for the open section or card.
    content_buffer: List[str]
    # Pending lines for the open detail.
    detail_buffer: List[str]
|
||||
+126
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Any
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from lib.types import PagesDict
|
||||
|
||||
DEFAULT_TEMPLATE_DIR = 'templates'
|
||||
DEFAULT_OUTPUT_DIR = 'html'
|
||||
|
||||
|
||||
def get_template_files(template_dir: str = DEFAULT_TEMPLATE_DIR) -> List[str]:
    """Return template filenames from the provided template directory.

    Partials (names starting with '_') are excluded; a missing directory
    yields an empty list.
    """
    if not os.path.exists(template_dir):
        return []

    def _is_page_template(name: str) -> bool:
        return name.endswith('.html') and not name.startswith('_')

    return [name for name in os.listdir(template_dir) if _is_page_template(name)]
|
||||
|
||||
|
||||
def get_css_files(output_dir: str = DEFAULT_OUTPUT_DIR) -> List[str]:
    """Return CSS file paths under ``<output_dir>/css`` (empty if missing)."""
    css_dir = os.path.join(output_dir, 'css')
    if not os.path.exists(css_dir):
        return []
    paths: List[str] = []
    for entry in os.listdir(css_dir):
        if entry.endswith('.css'):
            paths.append(os.path.join(css_dir, entry))
    return paths
|
||||
|
||||
|
||||
def get_js_files(output_dir: str = DEFAULT_OUTPUT_DIR) -> List[str]:
    """Return JavaScript file paths under ``<output_dir>/js`` (empty if missing)."""
    js_dir = os.path.join(output_dir, 'js')
    if not os.path.exists(js_dir):
        return []
    entries = os.listdir(js_dir)
    return [os.path.join(js_dir, entry)
            for entry in entries
            if entry.endswith('.js')]
|
||||
|
||||
|
||||
def render_template(
        template_name: str,
        context: Dict[str, Any],
        template_dir: str = DEFAULT_TEMPLATE_DIR) -> str:
    """Render a Jinja2 template with the provided context.

    NOTE(review): the Environment is built per call and without
    autoescape — fine for trusted, self-authored templates; confirm no
    user-supplied values are rendered.
    """
    loader = FileSystemLoader(template_dir)
    environment = Environment(loader=loader)
    template = environment.get_template(template_name)
    return template.render(context)
|
||||
|
||||
|
||||
def set_active_page_by_url(pages: PagesDict, page_url: str) -> None:
    """Mark exactly the entry whose key equals *page_url* as active.

    Mutates every PageEntry in place: the matching key gets
    ``active=True``, all others ``active=False``.
    """
    for url, entry in pages.items():
        entry['active'] = (url == page_url)
|
||||
|
||||
|
||||
def minify_js(js: str) -> str:
    """Minify JavaScript by stripping comments and collapsing whitespace.

    NOTE(review): regex-based, so ``//`` or ``/* */`` sequences inside
    string literals (e.g. URLs like ``http://``) will be mangled — fine
    for this site's simple scripts, not a general JS minifier.
    """
    # Strip // line comments — including one at EOF with no trailing
    # newline, which the previous '//.*?\n' pattern missed — and
    # /* ... */ block comments.
    js = re.sub(r'//[^\n]*|/\*.*?\*/', '', js, flags=re.DOTALL)
    # Collapse all whitespace runs to single spaces.  (The old
    # ';'->';\n' pass followed by '\n+'->' ' was a no-op round-trip
    # after this collapse and has been removed.)
    js = re.sub(r'\s+', ' ', js)
    return js.strip()
|
||||
|
||||
|
||||
def minify_css(css: str) -> str:
    """Minify CSS by removing comments and redundant whitespace."""
    # Ordered substitution pipeline; each pair is (pattern, replacement).
    passes = (
        (r'/\*.*?\*/', ''),       # strip block comments
        (r'\s+', ' '),            # collapse whitespace runs
        (r';\s+', ';\n'),         # temporary newline after ';'
        (r'\s+([\{\s])', r'\1'),  # drop space before '{' (or whitespace)
        (r'\s+}', '}'),           # drop space before '}'
        (r'\n+', ' '),            # fold the temporary newlines back
    )
    for pattern, replacement in passes:
        css = re.sub(pattern, replacement, css, flags=re.DOTALL)
    return css.strip()
|
||||
|
||||
|
||||
def minify_html(html: str) -> str:
    """Minify HTML by removing comments and redundant whitespace.

    NOTE(review): whitespace inside <pre>/<textarea> is collapsed too —
    acceptable for this site's templates, not a general HTML minifier.
    """
    # Ordered substitution pipeline; each pair is (pattern, replacement).
    passes = (
        (r'<!--.*?-->', ''),  # strip HTML comments
        (r'\s+', ' '),        # collapse whitespace runs
        (r'>\s+<', '><'),     # drop whitespace between adjacent tags
        (r'<\s+', '<'),       # drop space after '<'
        (r'\s+</', '</'),     # drop space before a closing tag
    )
    for pattern, replacement in passes:
        html = re.sub(pattern, replacement, html, flags=re.DOTALL)
    return html.strip()
|
||||
|
||||
|
||||
def css_minifier(output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Minify every CSS file under ``<output_dir>/css`` in place."""
    for path in get_css_files(output_dir):
        with open(path, 'r', encoding='utf-8') as source:
            original = source.read()
        compact = minify_css(original)
        with open(path, 'w', encoding='utf-8') as target:
            target.write(compact)
|
||||
|
||||
|
||||
def js_minifier(output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Minify every JavaScript file under ``<output_dir>/js`` in place."""
    for path in get_js_files(output_dir):
        with open(path, 'r', encoding='utf-8') as source:
            original = source.read()
        compact = minify_js(original)
        with open(path, 'w', encoding='utf-8') as target:
            target.write(compact)
|
||||
|
||||
|
||||
# Explicit public API of this module (for `import *` consumers and linters).
__all__ = [
    'DEFAULT_OUTPUT_DIR',
    'DEFAULT_TEMPLATE_DIR',
    'get_template_files',
    'get_css_files',
    'get_js_files',
    'render_template',
    'set_active_page_by_url',
    'minify_js',
    'minify_css',
    'minify_html',
    'css_minifier',
    'js_minifier',
]
|
||||
Reference in New Issue
Block a user