diff --git a/scripts/player-api-server.sh b/scripts/player-api-server.sh
index 17c0984b..643aa332 100755
--- a/scripts/player-api-server.sh
+++ b/scripts/player-api-server.sh
@@ -25,6 +25,12 @@ set -uo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
 
+# Wrap godot in heavy-tests.slice on Linux so a fleet of player_api workers
+# (typical RL training spawns 16-64) cannot starve sshd / interactive work.
+# See scripts/run/heavy-prefix.sh and ~/.config/systemd/user/heavy-tests.slice.
+# shellcheck source=run/heavy-prefix.sh
+source "${SCRIPT_DIR}/run/heavy-prefix.sh"
+
 # Defaults — adapter overrides via env.
 : "${CP_SEED:=42}"
 : "${CP_PLAYERS:=2}"
@@ -59,22 +65,23 @@ case "$(uname -s)" in
       res://engine/scenes/headless/player_api_main.tscn
     ;;
   *)
-    exec flatpak run --user \
-      --env=CP_SEED="$CP_SEED" \
-      --env=CP_PLAYERS="$CP_PLAYERS" \
-      --env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
-      --env=CP_MAP_SIZE="$CP_MAP_SIZE" \
-      --env=CP_MAP_TYPE="$CP_MAP_TYPE" \
-      --env=CP_OMNISCIENT="$CP_OMNISCIENT" \
-      --env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
-      --env=CP_LOG_FILE="$CP_LOG_FILE" \
-      --env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
-      --env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
-      --env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
-      org.godotengine.Godot \
-      --path "$PROJECT_DIR/src/game" \
-      --headless \
-      --rendering-method gl_compatibility \
-      res://engine/scenes/headless/player_api_main.tscn
+    heavy_exec "player-api-$$" \
+      flatpak run --user \
+        --env=CP_SEED="$CP_SEED" \
+        --env=CP_PLAYERS="$CP_PLAYERS" \
+        --env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
+        --env=CP_MAP_SIZE="$CP_MAP_SIZE" \
+        --env=CP_MAP_TYPE="$CP_MAP_TYPE" \
+        --env=CP_OMNISCIENT="$CP_OMNISCIENT" \
+        --env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
+        --env=CP_LOG_FILE="$CP_LOG_FILE" \
+        --env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
+        --env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
+        --env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
+        org.godotengine.Godot \
+          --path "$PROJECT_DIR/src/game" \
+          --headless \
+          --rendering-method gl_compatibility \
+          res://engine/scenes/headless/player_api_main.tscn
     ;;
 esac
diff --git a/scripts/run/heavy-prefix.sh b/scripts/run/heavy-prefix.sh
new file mode 100644
index 00000000..ab26dcb1
--- /dev/null
+++ b/scripts/run/heavy-prefix.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# heavy-prefix.sh — containment helpers for batch/long-running workloads.
+#
+# Sourcing this file has no side effects beyond defining functions.
+#
+# Purpose: keep heavy godot/python workloads (RL training, autoplay batches,
+# proof renders, claude-player workers) inside the heavy-tests.slice cgroup
+# so they cannot starve sshd / interactive work on apricot. Background: on
+# 2026-05-18 / 2026-05-19 the box wedged when ~3000 godot workers spawned
+# outside any cgroup; CPUWeight=20 on the slice lets sshd preempt.
+
+# Succeed only on Linux hosts that have systemd-run on PATH. This does not
+# check that a user service manager is actually reachable.
+_heavy_have_systemd_run() {
+  [[ "$(uname -s)" == "Linux" ]] && command -v systemd-run >/dev/null 2>&1
+}
+
+# Replace the current shell with the given command, wrapped in a transient
+# scope under heavy-tests.slice at nice 10 / idle I/O class. Falls back to a
+# plain exec on non-Linux hosts or when systemd-run is not installed.
+# A missing argument trips ${1:?}, which aborts a non-interactive shell.
+# Args: <unit> <cmd> [args...]
+heavy_exec() {
+  local unit="${1:?heavy_exec: unit name required}"
+  shift
+  # Without this guard a bare `exec` is a no-op and the caller would carry on.
+  : "${1:?heavy_exec: command required}"
+  if _heavy_have_systemd_run; then
+    exec systemd-run --user \
+      --slice=heavy-tests.slice \
+      --scope --quiet --collect \
+      --unit="${unit}" \
+      -- nice -n 10 ionice -c 3 "$@"
+  fi
+  exec "$@"
+}
+
+# Start the given command as a detached transient .service under the slice.
+# Returns immediately; logs go to the journal (journalctl --user -u <unit>).
+# On hosts without systemd-run the command is backgrounded with nohup instead
+# and its output is discarded.
+# NOTE(review): a transient service is started by the user manager, not by
+# this shell, so it presumably does not see the caller's environment or
+# working directory (the nohup fallback does) — confirm before relying on it.
+# Args: <unit> <cmd> [args...]
+# Returns: systemd-run's exit status, or 0 once the fallback job is spawned.
+heavy_service() {
+  local unit="${1:?heavy_service: unit name required}"
+  shift
+  # Reject an empty command up front; the nohup fallback would hide the error.
+  : "${1:?heavy_service: command required}"
+  if _heavy_have_systemd_run; then
+    systemd-run --user \
+      --slice=heavy-tests.slice \
+      --unit="${unit}" \
+      --collect --quiet \
+      -- "$@"
+    return $?
+  fi
+  nohup "$@" >/dev/null 2>&1 &
+}