From 446c432c186aecf36e65e119f527c8bb56c432fc Mon Sep 17 00:00:00 2001
From: zwitschi
Date: Thu, 22 Jan 2026 16:55:18 +0100
Subject: [PATCH] fix: ensure scheduler starts only once during Flask requests

---
Review notes (this section sits after the "---" separator, so it is
commentary only and is not part of the commit message):

* web/app.py: the @app.before_first_request hook is replaced by an
  @app.before_request hook that delegates to _maybe_start_scheduler().
  NOTE(review): presumably needed because before_first_request is gone in
  newer Flask releases (removed in 2.3); confirm against the pinned version.
* _maybe_start_scheduler() returns early when the module flag
  _scheduler_started is set or _scheduler_enabled() is false; otherwise it
  re-checks the flag while holding _scheduler_lock, calls
  start_scheduler_in_background(), and only then sets the flag. Because the
  flag is set after the call, an exception from
  start_scheduler_in_background() leaves it False and the next request tries
  again.
* tests/test_scheduler.py: the new test issues two GET /health requests with
  start_scheduler_in_background patched and asserts a single call.
  NOTE(review): it uses patch(...), whose import is outside these hunks;
  confirm it is already imported. It also assigns
  app_module._scheduler_started directly, so the flag stays True after the
  test; monkeypatch.setattr would restore it.
* Line breaks and indentation in this file were rebuilt from the hunk
  headers' line counts; context-line whitespace is inferred, so check it
  against the tree (or apply with --ignore-whitespace) if a hunk is rejected.

 tests/test_scheduler.py | 20 ++++++++++++++++++++
 web/app.py              | 23 ++++++++++++++++++++---
 2 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index 38521cb..f792b65 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -153,3 +153,23 @@ class TestScheduler:
         assert search_data.get(("losangeles", "python")) == 4
         assert search_data.get(("losangeles", "java")) == 1
         assert result.get("discovered") == 10  # Total unique jobs
+
+
+def test_app_scheduler_starts_once(monkeypatch):
+    """Ensure the Flask before_request hook starts scheduler only once."""
+    import web.app as app_module
+
+    monkeypatch.setenv("SCRAPE_SCHEDULER_ENABLED", "true")
+    monkeypatch.delenv("SERVER_SOFTWARE", raising=False)
+    monkeypatch.delenv("FLASK_RUN_FROM_CLI", raising=False)
+    monkeypatch.delenv("WERKZEUG_RUN_MAIN", raising=False)
+
+    app_module._scheduler_started = False
+
+    with patch("web.app.start_scheduler_in_background") as mock_start:
+        app_module.app.config.update(TESTING=True, WTF_CSRF_ENABLED=False)
+        with app_module.app.test_client() as client:
+            client.get("/health")
+            client.get("/health")
+
+    assert mock_start.call_count == 1
diff --git a/web/app.py b/web/app.py
index 49ce029..a84358f 100644
--- a/web/app.py
+++ b/web/app.py
@@ -1,4 +1,5 @@
 import os
+import threading
 from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
 from flask_wtf import CSRFProtect
 from typing import Dict, List
@@ -60,6 +61,10 @@ app.static_folder = "static"
 csrf = CSRFProtect(app)
 
 
+_scheduler_started = False
+_scheduler_lock = threading.Lock()
+
+
 def _scheduler_enabled() -> bool:
     flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
     if flag not in {"1", "true", "yes", "on"}:
@@ -74,10 +79,22 @@ def _scheduler_enabled() -> bool:
     return True
 
 
-@app.before_first_request
-def _start_scheduler_if_enabled():
-    if _scheduler_enabled():
+def _maybe_start_scheduler() -> None:
+    global _scheduler_started
+    if _scheduler_started:
+        return
+    if not _scheduler_enabled():
+        return
+    with _scheduler_lock:
+        if _scheduler_started:
+            return
         start_scheduler_in_background()
+        _scheduler_started = True
+
+
+@app.before_request
+def _start_scheduler_if_enabled():
+    _maybe_start_scheduler()
 
 
 def require_admin():