Add OSM Track Harvesting Policy and demo database initialization script

- Documented the OSM Track Harvesting Policy (railway types, service filters, usage filters, and geometry guardrails) and wired the matching ingestion constants into the OSM import and load scripts.
- Introduced a new script, `scripts/init_demo_db.py`, that automates database setup: environment checks, database migrations, and loading OSM fixtures for demo data.
2025-10-11 21:37:25 +02:00
parent 0b84ee953e
commit 25ca7ab196
9 changed files with 537737 additions and 18 deletions


@@ -76,17 +76,42 @@ STATION_TAG_FILTERS: Mapping[str, Tuple[str, ...]] = {
# Tags that describe rail infrastructure usable for train routing.
TRACK_ALLOWED_RAILWAY_TYPES: Tuple[str, ...] = (
"rail",
"light_rail",
"subway",
"tram",
"narrow_gauge",
"disused",
"construction",
)
TRACK_TAG_FILTERS: Mapping[str, Tuple[str, ...]] = {
"railway": (
"rail",
"light_rail",
"subway",
"tram",
"narrow_gauge",
),
"railway": TRACK_ALLOWED_RAILWAY_TYPES,
}
# Track ingestion policy
TRACK_EXCLUDED_SERVICE_TAGS: Tuple[str, ...] = (
"yard",
"siding",
"spur",
"crossover",
"industrial",
"military",
)
TRACK_EXCLUDED_USAGE_TAGS: Tuple[str, ...] = (
"military",
"tourism",
)
TRACK_MIN_LENGTH_METERS: float = 75.0
TRACK_STATION_SNAP_RADIUS_METERS: float = 350.0
def compile_overpass_filters(filters: Mapping[str, Iterable[str]]) -> str:
"""Build an Overpass boolean expression that matches the provided filters."""
@@ -101,6 +126,11 @@ __all__ = [
"BoundingBox",
"DEFAULT_REGIONS",
"STATION_TAG_FILTERS",
"TRACK_ALLOWED_RAILWAY_TYPES",
"TRACK_TAG_FILTERS",
"TRACK_EXCLUDED_SERVICE_TAGS",
"TRACK_EXCLUDED_USAGE_TAGS",
"TRACK_MIN_LENGTH_METERS",
"TRACK_STATION_SNAP_RADIUS_METERS",
"compile_overpass_filters",
]
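For orientation, the fragment below sketches how a helper like compile_overpass_filters could turn TRACK_TAG_FILTERS into an Overpass QL clause. It is a hypothetical illustration, not the repository's actual implementation, and the exact clause format it produces is an assumption.

from typing import Iterable, Mapping

def sketch_overpass_filters(filters: Mapping[str, Iterable[str]]) -> str:
    # Hypothetical rendering: one regex tag clause per key, with the values OR-ed together.
    clauses = []
    for key, values in filters.items():
        pattern = "|".join(sorted(values))
        clauses.append(f'["{key}"~"^({pattern})$"]')
    return "".join(clauses)

# sketch_overpass_filters({"railway": TRACK_ALLOWED_RAILWAY_TYPES}) would yield
# ["railway"~"^(construction|disused|light_rail|narrow_gauge|rail|subway|tram)$"],
# a clause an Overpass query could apply to ways inside each region's bounding box.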


@@ -8,11 +8,15 @@ import math
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any, Iterable
from typing import Any, Iterable, Mapping
from urllib.parse import quote_plus
from backend.app.core.osm_config import (
DEFAULT_REGIONS,
TRACK_ALLOWED_RAILWAY_TYPES,
TRACK_EXCLUDED_SERVICE_TAGS,
TRACK_EXCLUDED_USAGE_TAGS,
TRACK_MIN_LENGTH_METERS,
TRACK_TAG_FILTERS,
compile_overpass_filters,
)
@@ -104,13 +108,15 @@ def normalize_track_elements(elements: Iterable[dict[str, Any]]) -> list[dict[st
continue
tags: dict[str, Any] = element.get("tags", {})
length_meters = _polyline_length(coordinates)
if not _should_include_track(tags, length_meters):
continue
name = tags.get("name")
maxspeed = _parse_maxspeed(tags.get("maxspeed"))
status = _derive_status(tags.get("railway"))
is_bidirectional = not _is_oneway(tags.get("oneway"))
length_meters = _polyline_length(coordinates)
tracks.append(
{
"osmId": str(element.get("id")),
@@ -156,6 +162,25 @@ def _derive_status(value: Any) -> str:
return "operational"
def _should_include_track(tags: Mapping[str, Any], length_meters: float) -> bool:
railway = str(tags.get("railway", "")).lower()
if railway not in TRACK_ALLOWED_RAILWAY_TYPES:
return False
if length_meters < TRACK_MIN_LENGTH_METERS:
return False
service = str(tags.get("service", "")).lower()
if service and service in TRACK_EXCLUDED_SERVICE_TAGS:
return False
usage = str(tags.get("usage", "")).lower()
if usage and usage in TRACK_EXCLUDED_USAGE_TAGS:
return False
return True
def _is_oneway(value: Any) -> bool:
if value is None:
return False
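The length guard above relies on _polyline_length, whose body is not part of this hunk. Assuming it computes great-circle distances, a minimal haversine-based sketch of such a helper would look like the following; the module's actual implementation may differ.

import math
from typing import Sequence

def haversine_meters(a: tuple[float, float], b: tuple[float, float]) -> float:
    # Great-circle distance in meters between two (lat, lon) pairs.
    lat1, lon1, lat2, lon2 = map(math.radians, (*a, *b))
    h = (
        math.sin((lat2 - lat1) / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    )
    return 2 * 6_371_000 * math.asin(math.sqrt(h))

def polyline_length_sketch(coordinates: Sequence[tuple[float, float]]) -> float:
    # Sum of consecutive segment lengths; ways below TRACK_MIN_LENGTH_METERS (75 m) are rejected.
    return sum(haversine_meters(p, q) for p, q in zip(coordinates, coordinates[1:]))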


@@ -13,6 +13,7 @@ from typing import Any, Iterable, Mapping, Sequence
from geoalchemy2.elements import WKBElement, WKTElement
from geoalchemy2.shape import to_shape
from backend.app.core.osm_config import TRACK_STATION_SNAP_RADIUS_METERS
from backend.app.db.session import SessionLocal
from backend.app.models import TrackCreate
from backend.app.repositories import StationRepository, TrackRepository
@@ -133,26 +134,40 @@ def load_tracks(tracks: Iterable[ParsedTrack], commit: bool = True) -> int:
for track_data in tracks:
start_station = _nearest_station(
track_data.coordinates[0], station_index)
track_data.coordinates[0],
station_index,
TRACK_STATION_SNAP_RADIUS_METERS,
)
end_station = _nearest_station(
track_data.coordinates[-1], station_index)
track_data.coordinates[-1],
station_index,
TRACK_STATION_SNAP_RADIUS_METERS,
)
if not start_station or not end_station:
continue
if start_station.id == end_station.id:
continue
pair = (start_station.id, end_station.id)
if pair in existing_pairs:
continue
length = track_data.length_meters or _polyline_length(
track_data.coordinates)
max_speed = (
int(round(track_data.max_speed_kph))
if track_data.max_speed_kph is not None
else None
)
create_schema = TrackCreate(
name=track_data.name,
start_station_id=start_station.id,
end_station_id=end_station.id,
coordinates=track_data.coordinates,
length_meters=length,
max_speed_kph=track_data.max_speed_kph,
max_speed_kph=max_speed,
status=track_data.status,
is_bidirectional=track_data.is_bidirectional,
)
@@ -170,7 +185,9 @@ def load_tracks(tracks: Iterable[ParsedTrack], commit: bool = True) -> int:
def _nearest_station(
coordinate: tuple[float, float], stations: Sequence[StationRef]
coordinate: tuple[float, float],
stations: Sequence[StationRef],
max_distance_meters: float,
) -> StationRef | None:
best_station: StationRef | None = None
best_distance = math.inf
@@ -180,7 +197,9 @@ def _nearest_station(
if distance < best_distance:
best_station = station
best_distance = distance
return best_station
if best_distance <= max_distance_meters:
return best_station
return None
def _build_station_index(stations: Iterable[Any]) -> list[StationRef]:
@@ -192,11 +211,15 @@ def _build_station_index(stations: Iterable[Any]) -> list[StationRef]:
point = _to_point(location)
if point is None:
continue
latitude = getattr(point, "y", None)
longitude = getattr(point, "x", None)
if latitude is None or longitude is None:
continue
index.append(
StationRef(
id=str(station.id),
latitude=float(point.y),
longitude=float(point.x),
latitude=float(latitude),
longitude=float(longitude),
)
)
return index
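Putting the pieces together, a track that passes both snap checks and the pair deduplication is persisted through a TrackCreate roughly like the one below. The field names follow the call in this hunk; the concrete values are illustrative placeholders, not data from the fixtures.

TrackCreate(
    name="Stadtbahn Segment",                      # optional OSM name tag, may be None
    start_station_id="station-a",                  # nearest station within 350 m of the first coordinate
    end_station_id="station-b",                    # nearest station within 350 m of the last coordinate
    coordinates=[[52.5251, 13.3694], [52.5105, 13.4349]],
    length_meters=4700.0,                          # parsed length, or the _polyline_length fallback
    max_speed_kph=80,                              # rounded to int when a maxspeed was parsed
    status="operational",                          # derived from the railway tag
    is_bidirectional=True,                         # oneway tag absent or "no"
)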


@@ -67,3 +67,44 @@ def test_normalize_track_elements_marks_oneway_and_status() -> None:
track = tracks[0]
assert track["status"] == "disused"
assert track["isBidirectional"] is False
def test_normalize_track_elements_skips_service_tracks() -> None:
elements = [
{
"type": "way",
"id": 77,
"geometry": [
{"lat": 52.5000, "lon": 13.4000},
{"lat": 52.5010, "lon": 13.4010},
],
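# The two points are roughly 130 m apart, so the way clears the 75 m length guard; the service=yard tag alone excludes it.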
"tags": {
"railway": "rail",
"service": "yard",
},
}
]
tracks = tracks_import.normalize_track_elements(elements)
assert tracks == []
def test_normalize_track_elements_skips_short_tracks() -> None:
elements = [
{
"type": "way",
"id": 81,
"geometry": [
{"lat": 52.500000, "lon": 13.400000},
{"lat": 52.500100, "lon": 13.400050},
],
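# These coordinates are only about 12 m apart, well below TRACK_MIN_LENGTH_METERS (75 m), so the way is dropped.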
"tags": {
"railway": "rail",
},
}
]
tracks = tracks_import.normalize_track_elements(elements)
assert tracks == []


@@ -166,3 +166,35 @@ def test_load_tracks_skips_existing_pairs(monkeypatch: pytest.MonkeyPatch) -> None:
assert created == 0
assert session_instance.rolled_back is True
assert not track_repo_instance.created
def test_load_tracks_skips_when_station_too_far(monkeypatch: pytest.MonkeyPatch) -> None:
session_instance = DummySession()
station_repo_instance = DummyStationRepository(
session_instance,
stations=[
DummyStation(id="remote-station", location=_point(53.5, 14.5)),
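# Roughly 130 km from the parsed track near (52.5, 13.4), far beyond the 350 m snap radius.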
],
)
track_repo_instance = DummyTrackRepository(session_instance)
monkeypatch.setattr(tracks_load, "SessionLocal", lambda: session_instance)
monkeypatch.setattr(tracks_load, "StationRepository",
lambda session: station_repo_instance)
monkeypatch.setattr(tracks_load, "TrackRepository",
lambda session: track_repo_instance)
parsed = tracks_load._parse_track_entries(
[
{
"name": "Isolated Segment",
"coordinates": [[52.5, 13.4], [52.51, 13.41]],
}
]
)
created = tracks_load.load_tracks(parsed, commit=True)
assert created == 0
assert session_instance.committed is True
assert not track_repo_instance.created

data/osm_stations.json: new file, 9,782 lines added (file diff suppressed because it is too large)

data/osm_tracks.json: new file, 527,625 lines added (file diff suppressed because it is too large)


@@ -55,6 +55,13 @@ Dynamic simulation of train operations:
- **Fallback Mechanisms**: Polling as alternative when WebSockets unavailable
- **Event-Driven Updates**: Push notifications for game state changes
#### 8.2.4 OSM Track Harvesting Policy
- **Railway Types**: The importer requests `rail`, `light_rail`, `subway`, `tram`, and `narrow_gauge` ways, plus `construction` and `disused` ways to capture build-state metadata.
- **Service Filters**: `service` tags such as `yard`, `siding`, `spur`, `crossover`, `industrial`, or `military` are excluded to focus on mainline traffic.
- **Usage Filters**: Ways flagged with `usage=military` or `usage=tourism` are skipped; unspecified usage defaults to accepted.
- **Geometry Guardrails**: Segments shorter than 75 meters are discarded, and each track endpoint must snap to an existing station within 350 meters; otherwise the segment is ignored during loading. The sketch after this list shows how the rules combine.
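A compact sketch of how these rules can combine into a single acceptance check, using the policy constants exported from backend.app.core.osm_config; it illustrates the policy rather than reproducing the importer code.

from backend.app.core.osm_config import (
    TRACK_ALLOWED_RAILWAY_TYPES,
    TRACK_EXCLUDED_SERVICE_TAGS,
    TRACK_EXCLUDED_USAGE_TAGS,
    TRACK_MIN_LENGTH_METERS,
    TRACK_STATION_SNAP_RADIUS_METERS,
)

def accept_way(tags: dict, length_m: float, snap_start_m: float, snap_end_m: float) -> bool:
    # Railway type must be on the allow list (rail, light_rail, subway, tram, narrow_gauge, disused, construction).
    if tags.get("railway") not in TRACK_ALLOWED_RAILWAY_TYPES:
        return False
    # Yard, siding, spur, crossover, industrial, and military service ways are skipped.
    if tags.get("service") in TRACK_EXCLUDED_SERVICE_TAGS:
        return False
    # Military and tourism usage is skipped; a missing usage tag is accepted.
    if tags.get("usage") in TRACK_EXCLUDED_USAGE_TAGS:
        return False
    # Geometry guardrails: minimum length plus station snapping at both endpoints.
    if length_m < TRACK_MIN_LENGTH_METERS:
        return False
    return max(snap_start_m, snap_end_m) <= TRACK_STATION_SNAP_RADIUS_METERS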
### 8.3 User Interface Concepts
#### 8.3.1 Component-Based Architecture
@@ -127,4 +134,3 @@ Dynamic simulation of train operations:
- **Lazy Loading**: On-demand loading of components and data
- **Caching Layers**: Redis for frequently accessed data
- **Asset Optimization**: Minification and compression of static resources

scripts/init_demo_db.py: new file, 155 lines added

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Initialize the database with demo data for the Rail Game.
This script automates the database setup process:
1. Validates environment setup
2. Runs database migrations
3. Loads OSM fixtures for demo data
Usage:
python scripts/init_demo_db.py [--dry-run] [--region REGION]
Requirements:
- Virtual environment activated
- .env file configured with DATABASE_URL
- PostgreSQL with PostGIS running
"""
import argparse
import os
import subprocess
import sys
from pathlib import Path
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
print("WARNING: python-dotenv not installed. .env file will not be loaded automatically.")
print("Install with: pip install python-dotenv")
def check_virtualenv():
"""Check if we're running in a virtual environment."""
if not hasattr(sys, 'real_prefix') and not (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix):
print("ERROR: Virtual environment not activated. Run:")
print(" .venv\\Scripts\\Activate.ps1 (PowerShell)")
print(" source .venv/bin/activate (Bash/macOS/Linux)")
sys.exit(1)
def check_env_file():
"""Check if .env file exists."""
env_file = Path('.env')
if not env_file.exists():
print("ERROR: .env file not found. Copy .env.example to .env and configure:")
print(" Copy-Item .env.example .env (PowerShell)")
print(" cp .env.example .env (Bash)")
sys.exit(1)
def check_database_url():
"""Check if DATABASE_URL is set in environment."""
database_url = os.getenv('DATABASE_URL')
if not database_url:
print("ERROR: DATABASE_URL not set. Check your .env file.")
sys.exit(1)
print(f"Using database: {database_url}")
def run_command(cmd, cwd=None, description=""):
"""Run a shell command and return the result."""
print(f"\n>>> {description}")
print(f"Running: {' '.join(cmd)}")
try:
result = subprocess.run(cmd, cwd=cwd, check=True,
capture_output=True, text=True)
if result.stdout:
print(result.stdout)
return result
except subprocess.CalledProcessError as e:
print(f"ERROR: Command failed with exit code {e.returncode}")
if e.stdout:
print(e.stdout)
if e.stderr:
print(e.stderr)
sys.exit(1)
def run_migrations():
"""Run database migrations using alembic."""
run_command(
['alembic', 'upgrade', 'head'],
cwd='backend',
description="Running database migrations"
)
def load_osm_fixtures(region, dry_run=False):
"""Load OSM fixtures for demo data."""
cmd = ['python', '-m', 'backend.scripts.osm_refresh', '--region', region]
if dry_run:
cmd.append('--no-commit')
description = f"Loading OSM fixtures (dry run) for region: {region}"
else:
description = f"Loading OSM fixtures for region: {region}"
run_command(cmd, description=description)
def main():
parser = argparse.ArgumentParser(
description="Initialize database with demo data")
parser.add_argument(
'--region',
default='all',
help='OSM region to load (default: all)'
)
parser.add_argument(
'--dry-run',
action='store_true',
help='Dry run: load OSM fixtures with --no-commit (migrations still run unless skipped)'
)
parser.add_argument(
'--skip-migrations',
action='store_true',
help='Skip running migrations'
)
parser.add_argument(
'--skip-fixtures',
action='store_true',
help='Skip loading OSM fixtures'
)
args = parser.parse_args()
print("Rail Game Database Initialization")
print("=" * 40)
# Pre-flight checks
check_virtualenv()
check_env_file()
check_database_url()
# Run migrations
if not args.skip_migrations:
run_migrations()
else:
print("Skipping migrations (--skip-migrations)")
# Load fixtures
if not args.skip_fixtures:
load_osm_fixtures(args.region, args.dry_run)
else:
print("Skipping fixtures (--skip-fixtures)")
print("\n✅ Database initialization completed successfully!")
if args.dry_run:
print("Note: This was a dry run. No data was committed to the database.")
else:
print("Demo data loaded. You can now start the backend server.")
if __name__ == '__main__':
main()
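A typical first run, given the defaults above, is:

    python scripts/init_demo_db.py                # migrate and load every configured region
    python scripts/init_demo_db.py --dry-run      # same flow, but fixtures are loaded with --no-commit

Both forms assume the virtual environment is active and .env provides DATABASE_URL, as the pre-flight checks enforce.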