# allucanget.biz/tests/test_markdown_parser.py

import textwrap
import pytest
from bs4 import BeautifulSoup
from markdown import Markdown
from xml.etree.ElementTree import Element, SubElement
from lib.markdown_parser import (
    HeadingCollector,
    HeadingExtension,
    build_component_structure,
    markdown_to_html_lines,
    parse_markdown_file,
)


def test_h4_creates_card_details():
    """Check that H4 headings below an H3 card end up in the card's ``details``.

    The document has one section with one card; the card carries an intro
    paragraph followed by two H4 blocks (one with a list, one with a link).
    """
    source = textwrap.dedent("""
    # Page Title

    ## Section One

    ### Card One
    Card intro paragraph.

    #### Detail A
    Detail content line 1.
    - item 1
    - item 2

    #### Detail B
    Another detail paragraph with [a link](https://example.com).
    """)

    result = build_component_structure(source, "test.md")

    # Page level: title and a single section.
    assert result['title'] == 'Page Title'
    assert len(result['sections']) == 1
    only_section = result['sections'][0]
    assert only_section['title'] == 'Section One'

    # Section level: a single card whose content holds the intro paragraph.
    assert len(only_section['cards']) == 1
    only_card = only_section['cards'][0]
    assert only_card['title'] == 'Card One'
    assert 'Card intro paragraph' in only_card['content']

    # Card level: one detail entry per H4, in document order.
    assert 'details' in only_card
    assert len(only_card['details']) == 2
    detail_a = only_card['details'][0]
    assert detail_a['title'] == 'Detail A'
    assert 'item 1' in detail_a['content']
    detail_b = only_card['details'][1]
    assert detail_b['title'] == 'Detail B'
    assert 'https://example.com' in detail_b['content']


def test_section_content_preserves_lists_before_first_card():
    """Check that a list between an H2 and its first H3 stays in the section content."""
    source = textwrap.dedent("""
    # Title

    ## Section One

    - Item A
    - Item B

    ### Card Title
    Card body
    """)

    result = build_component_structure(source, "test.md")
    first_section = result['sections'][0]

    # The bullet list must be rendered as HTML inside the section body.
    assert '<ul>' in first_section['content']
    assert '<li>Item A</li>' in first_section['content']
    # The card that follows the list is still detected.
    assert first_section['cards'][0]['title'] == 'Card Title'


def test_card_content_keeps_lists_when_section_changes():
    """Check that a card's list is kept when the next heading opens a new section."""
    source = textwrap.dedent("""
    # Title

    ## Section One

    ### Card One
    - First
    - Second

    ## Section Two
    Details
    """)

    result = build_component_structure(source, "test.md")

    # Only the card in the first section is inspected.
    leading_card = result['sections'][0]['cards'][0]
    assert '<ul>' in leading_card['content']
    assert '<li>First</li>' in leading_card['content']


def test_card_body_renders_list_items():
    """Check that a bullet list directly under a card heading is rendered as ``<ul>``."""
    source = textwrap.dedent("""
    # Title

    ## Section

    ### Card
    - Alpha
    - Beta
    - Gamma
    """)

    result = build_component_structure(source, "test.md")

    only_card = result['sections'][0]['cards'][0]
    assert '<ul>' in only_card['content']
    # First and last items are spot-checked.
    assert '<li>Alpha</li>' in only_card['content']
    assert '<li>Gamma</li>' in only_card['content']


def test_external_links_add_target_attributes():
    """Check that an ``https://`` link gets ``target="_blank"`` and a ``rel`` guard."""
    source = textwrap.dedent("""
    # Title

    ## Section

    Visit [Example](https://example.com) now.
    """)

    result = build_component_structure(source, "test.md")

    rendered = result['sections'][0]['content']
    link = BeautifulSoup(rendered, 'html.parser').find('a')

    assert link is not None
    assert link['href'] == 'https://example.com'
    assert link.get('target') == '_blank'
    # BeautifulSoup exposes the multi-valued ``rel`` attribute as a list.
    assert link.get('rel') == ['noopener', 'noreferrer']


def test_unsafe_links_are_neutralized():
    """Check that a ``javascript:`` link is rewritten to the ``#unsafe`` placeholder."""
    source = textwrap.dedent("""
    # Title

    ## Section

    [Bad](javascript:alert('xss'))
    """)

    result = build_component_structure(source, "test.md")

    rendered = result['sections'][0]['content']
    link = BeautifulSoup(rendered, 'html.parser').find('a')

    assert link is not None
    assert link['href'] == '#unsafe'
    # The neutralized link must not carry the external-link attributes.
    assert 'target' not in link.attrs
    assert 'rel' not in link.attrs


def test_index_page_sections_and_headings_extracted():
    """Check that the heading collector records the H1 and both H2 headings in order.

    The markdown literal is indented to match the function body, so it is
    passed through ``textwrap.dedent`` first. Without that, every line starts
    with four spaces and Markdown treats the whole document as an indented
    code block, which yields no headings at all.
    """
    text = textwrap.dedent("""
    # Title

    Main intro paragraph.

    ## Section 1

    Section 1 intro.

    - List item 1 a
    - List item 1 b

    ## Section 2

    Section 2 intro.

    - List item 2 a
    - List item 2 b

    """)
    md = Markdown(extensions=[HeadingExtension()])
    md.convert(text)
    collector = md.treeprocessors['heading_collector']
    # Keep only real heading entries; entries without a 'level' key are skipped.
    headings = [h for h in getattr(collector, 'headings', []) if 'level' in h]

    assert headings[0]['level'] == 1
    assert headings[0]['text'] == 'Title'
    assert headings[1]['level'] == 2
    assert headings[1]['text'] == 'Section 1'
    assert headings[2]['level'] == 2
    assert headings[2]['text'] == 'Section 2'


def test_index_page_main_intro_extracted():
    """Check that the paragraph after the H1 is exposed as the collector's ``main_intro``.

    The markdown literal is dedented before conversion. Left indented by four
    spaces it would be parsed as a code block rather than a heading followed
    by paragraphs.
    """
    text = textwrap.dedent("""
    # Title

    This is the main introduction paragraph for the index page.

    ## Section 1

    Section content.
    """)
    md = Markdown(extensions=[HeadingExtension()])
    md.convert(text)
    collector = md.treeprocessors['heading_collector']
    # Fall back to an empty string so a missing attribute fails the assertion
    # below instead of raising AttributeError.
    main_intro = getattr(collector, 'main_intro', '')

    assert 'This is the main introduction paragraph for the index page.' in main_intro


def test_heading_extension_collects_headings_and_lists():
    """Check that collected heading text drops inline markup and that lists are recorded."""
    source = textwrap.dedent("""
    # Title

    ## Section with **Bold** and [Link](#target)

    - Item A
    - [Item B](#b) and more
    """)
    converter = Markdown(extensions=[HeadingExtension()])
    converter.convert(source)

    processor = converter.treeprocessors['heading_collector']
    entries = getattr(processor, 'headings', [])

    title_entry = entries[0]
    assert title_entry['level'] == 1
    assert title_entry['text'] == 'Title'

    # Bold and link markup are expected to be reduced to their plain text.
    section_entry = entries[1]
    assert section_entry['level'] == 2
    assert section_entry['text'] == 'Section with Bold and Link'

    # The first entry tagged as an unordered list holds the bullet items.
    bullet_entry = next(item for item in entries if item.get('type') == 'ul')
    assert bullet_entry['items'][0] == 'Item A'
    assert 'Item B' in bullet_entry['items'][1]


def test_heading_collector_extract_text_handles_children():
    """Check ``_extract_text`` on nested elements and ``_process_element`` on a ``<p>``.

    The fixture tree is ``<p><strong>Bold<em>inner</em></strong> tail</p>``.
    """
    collector = HeadingCollector(Markdown())

    paragraph = Element('p')
    strong = SubElement(paragraph, 'strong')
    strong.text = 'Bold'
    strong.tail = ' tail'
    emphasis = SubElement(strong, 'em')
    emphasis.text = 'inner'

    extracted = collector._extract_text(paragraph)

    # Text, tail and nested-child text must all be present in the result.
    for fragment in ('Bold', 'tail', 'inner'):
        assert fragment in extracted

    # Processing a plain paragraph must leave the headings list empty.
    collector.headings = []
    collector._process_element(paragraph)
    assert collector.headings == []


def test_parse_markdown_file_sets_title_from_filename(tmp_path):
    """Check that a file with no heading gets its title from the file name."""
    markdown_file = tmp_path / 'sample_page.md'
    markdown_file.write_text('Just text without heading', encoding='utf-8')

    result = parse_markdown_file(str(markdown_file))

    # 'sample_page.md' is expected to become the title 'Sample Page'.
    assert result['title'] == 'Sample Page'


def test_parse_markdown_file_missing(tmp_path):
    """Check that parsing a path that does not exist raises ``FileNotFoundError``."""
    absent_path = str(tmp_path / 'not_there.md')

    with pytest.raises(FileNotFoundError):
        parse_markdown_file(absent_path)


def test_markdown_to_html_lines_handles_empty_and_blank_links():
    """Check empty input, a link with an empty target and an external link."""
    # Empty markdown renders to an empty string.
    assert markdown_to_html_lines('') == ''

    rendered = markdown_to_html_lines(
        'Before [Empty]() and [External](https://example.com)')
    links = BeautifulSoup(rendered, 'html.parser').find_all('a')

    # The empty link keeps its empty href and gets no target attribute.
    empty_link = links[0]
    assert empty_link['href'] == ''
    assert 'target' not in empty_link.attrs
    # The external link opens in a new tab.
    assert links[1]['target'] == '_blank'


def test_markdown_images_normalize_src_and_alt():
    """Check that a local image path is rendered as ``img/<file name>`` with its alt text."""
    rendered = markdown_to_html_lines(
        '![IT Consulting](assets/icons/it-consulting.svg)')
    img_tag = BeautifulSoup(rendered, 'html.parser').find('img')

    assert img_tag is not None
    assert img_tag['src'] == 'img/it-consulting.svg'
    assert img_tag['alt'] == 'IT Consulting'


def test_markdown_images_add_alt_fallback():
    """Check that an image with no alt text gets one derived from the file name."""
    rendered = markdown_to_html_lines('![](logo.svg)')
    img_tag = BeautifulSoup(rendered, 'html.parser').find('img')

    assert img_tag is not None
    assert img_tag['src'] == 'img/logo.svg'
    # 'logo.svg' is expected to yield the fallback alt text 'logo'.
    assert img_tag['alt'] == 'logo'


def test_markdown_heading_with_inline_image_preserves_image():
    """Check that an image written inside an H2 stays inside the rendered ``<h2>``."""
    rendered = markdown_to_html_lines('## Heading ![Graphic](img/graphic.svg)')
    h2_tag = BeautifulSoup(rendered, 'html.parser').find('h2')
    # Only look for the image when the heading itself was found.
    if h2_tag:
        img_tag = h2_tag.find('img')
    else:
        img_tag = None

    assert h2_tag is not None
    assert h2_tag.get_text(strip=True) == 'Heading'
    assert img_tag is not None
    assert img_tag['src'] == 'img/graphic.svg'
    assert img_tag['alt'] == 'Graphic'


def test_build_component_structure_converts_section_images():
    """Check that an image in a section body is rendered with a normalized ``src``."""
    source = textwrap.dedent("""
    # Title

    ## Section With Visual

    Intro text before image.

    ![Vision Diagram](assets/diagrams/vision.png)
    """)

    result = build_component_structure(source, "section.md")
    rendered = result['sections'][0]['content']
    img_tag = BeautifulSoup(rendered, 'html.parser').find('img')

    assert img_tag is not None
    # The 'assets/diagrams/' prefix is expected to be replaced by 'img/'.
    assert img_tag['src'] == 'img/vision.png'
    assert img_tag['alt'] == 'Vision Diagram'


def test_build_component_structure_preserves_external_image_src():
    """Check that an image with an absolute ``https://`` URL keeps its ``src`` unchanged."""
    source = textwrap.dedent("""
    # Title

    ## Section

    ![Remote Logo](https://cdn.example.com/logo.svg)
    """)

    result = build_component_structure(source, "external.md")
    rendered = result['sections'][0]['content']
    img_tag = BeautifulSoup(rendered, 'html.parser').find('img')

    assert img_tag is not None
    assert img_tag['src'] == 'https://cdn.example.com/logo.svg'
    assert img_tag['alt'] == 'Remote Logo'