vllm.entrypoints.launcher ¶
_add_shutdown_handlers ¶
The vLLM V1 AsyncLLM catches exceptions and surfaces only two types: EngineGenerateError and EngineDeadError.
EngineGenerateError is raised by the per request generate() method. This error could be request specific (and therefore recoverable - e.g. if there is an error in input processing).
EngineDeadError is raised by the background output_handler method. This error is global and therefore not recoverable.
We register these @app.exception_handlers to return nice responses to the end user if they occur and shut down if needed. See https://fastapi.tiangolo.com/tutorial/handling-errors/ for more details on how exception handlers work.
If an exception is encountered in a StreamingResponse generator, the exception is not raised, since we already sent a 200 status. Rather, we send an error message as the next chunk. Since the exception is not raised, the server will not automatically shut down. Instead, we use the watchdog background task to check for the errored state.
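A minimal sketch of the handler pattern described above. The exception classes, handler signatures, and the `DummyServer` type are simplified stand-ins for the real vLLM and FastAPI types, not the actual implementation:

```python
# Sketch of per-exception handling: a request-specific error returns an
# error response and keeps the server alive; a global engine error also
# asks the server to exit after the response is sent.

class EngineGenerateError(Exception):
    """Per-request failure raised by generate(); possibly recoverable."""

class EngineDeadError(Exception):
    """Global failure from the background output handler; not recoverable."""

class DummyServer:
    """Stand-in for a uvicorn Server; only the exit flag matters here."""
    should_exit = False

def handle_generate_error(exc: Exception) -> dict:
    # Request-specific: report the error, leave the server running.
    return {"status": 400, "detail": str(exc)}

def handle_engine_dead(exc: Exception, server: DummyServer) -> dict:
    # Global: report the error and signal shutdown (see terminate_if_errored
    # below for why we signal rather than await the shutdown here).
    server.should_exit = True
    return {"status": 500, "detail": str(exc)}

server = DummyServer()
resp = handle_generate_error(EngineGenerateError("bad prompt"))
assert resp["status"] == 400 and server.should_exit is False

resp = handle_engine_dead(EngineDeadError("engine died"), server)
assert resp["status"] == 500 and server.should_exit is True
```

In the real app these functions would be registered via FastAPI's `@app.exception_handler(...)` decorator and return `JSONResponse` objects, as described in the linked FastAPI tutorial.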
Source code in vllm/entrypoints/launcher.py
serve_http async ¶
```python
serve_http(
    app: FastAPI,
    sock: socket | None,
    enable_ssl_refresh: bool = False,
    **uvicorn_kwargs: Any,
)
```
Start a FastAPI app using Uvicorn, with support for custom Uvicorn config options. Supports HTTP header limits via the h11_max_incomplete_event_size and h11_max_header_count options.
terminate_if_errored ¶
```python
terminate_if_errored(server: Server, engine: EngineClient)
```
See the discussion on shutting down a Uvicorn server at https://github.com/encode/uvicorn/discussions/1103. We cannot await the server shutdown here because the handler must first return to close the connection for the current request.
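The "signal, don't await" pattern can be sketched as follows. `Server` and `EngineClient` here are minimal stand-ins (Uvicorn's real `Server` exposes a `should_exit` flag that its main loop polls); the `errored` attribute is an assumption about the engine client's interface:

```python
# Instead of awaiting server.shutdown() inside a request handler (which
# would block on the still-open connection for this request), set the
# exit flag and let the server's main loop shut down after the response
# has been sent.

class Server:
    should_exit = False

class EngineClient:
    errored = False

def terminate_if_errored(server: Server, engine: EngineClient) -> None:
    if engine.errored:
        server.should_exit = True

server, engine = Server(), EngineClient()
engine.errored = True
terminate_if_errored(server, engine)
assert server.should_exit is True
```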
watchdog_loop async ¶
```python
watchdog_loop(server: Server, engine: EngineClient)
```
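A watchdog like the one named above can be sketched as a background task that polls the engine's errored state, complementing the StreamingResponse case where no exception is raised. The poll interval, attribute names, and stand-in classes below are illustrative assumptions, not vLLM's actual code:

```python
import asyncio

class Server:
    should_exit = False

class Engine:
    errored = False

async def watchdog_loop(server: Server, engine: Engine,
                        interval: float = 0.01) -> None:
    # Poll the engine; if it reports a fatal error, signal the server
    # to exit (the same flag terminate_if_errored would set).
    while not server.should_exit:
        if engine.errored:
            server.should_exit = True
            break
        await asyncio.sleep(interval)

async def main() -> bool:
    server, engine = Server(), Engine()
    task = asyncio.create_task(watchdog_loop(server, engine))
    await asyncio.sleep(0.03)
    engine.errored = True  # simulate a fatal engine error mid-stream
    await task
    return server.should_exit

assert asyncio.run(main()) is True
```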