perf(player-api): Add systemd slice integration to enforce CPU/memory limits for player API workers during RL training

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-05-19 11:28:16 -07:00
parent 16ca411cd4
commit 0f24c80f1b
2 changed files with 71 additions and 17 deletions

View file

@ -25,6 +25,12 @@ set -uo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")" PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
# Wrap godot in heavy-tests.slice on Linux so a fleet of player_api workers
# (typical RL training spawns 16-64) cannot starve sshd / interactive work.
# See scripts/run/heavy-prefix.sh and ~/.config/systemd/user/heavy-tests.slice.
# shellcheck source=run/heavy-prefix.sh
source "${SCRIPT_DIR}/run/heavy-prefix.sh"
# Defaults — adapter overrides via env. # Defaults — adapter overrides via env.
: "${CP_SEED:=42}" : "${CP_SEED:=42}"
: "${CP_PLAYERS:=2}" : "${CP_PLAYERS:=2}"
@ -59,22 +65,23 @@ case "$(uname -s)" in
res://engine/scenes/headless/player_api_main.tscn res://engine/scenes/headless/player_api_main.tscn
;; ;;
*) *)
exec flatpak run --user \ heavy_exec "player-api-$$" \
--env=CP_SEED="$CP_SEED" \ flatpak run --user \
--env=CP_PLAYERS="$CP_PLAYERS" \ --env=CP_SEED="$CP_SEED" \
--env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \ --env=CP_PLAYERS="$CP_PLAYERS" \
--env=CP_MAP_SIZE="$CP_MAP_SIZE" \ --env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
--env=CP_MAP_TYPE="$CP_MAP_TYPE" \ --env=CP_MAP_SIZE="$CP_MAP_SIZE" \
--env=CP_OMNISCIENT="$CP_OMNISCIENT" \ --env=CP_MAP_TYPE="$CP_MAP_TYPE" \
--env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \ --env=CP_OMNISCIENT="$CP_OMNISCIENT" \
--env=CP_LOG_FILE="$CP_LOG_FILE" \ --env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
--env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \ --env=CP_LOG_FILE="$CP_LOG_FILE" \
--env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \ --env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
--env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \ --env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
org.godotengine.Godot \ --env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
--path "$PROJECT_DIR/src/game" \ org.godotengine.Godot \
--headless \ --path "$PROJECT_DIR/src/game" \
--rendering-method gl_compatibility \ --headless \
res://engine/scenes/headless/player_api_main.tscn --rendering-method gl_compatibility \
res://engine/scenes/headless/player_api_main.tscn
;; ;;
esac esac

View file

@ -0,0 +1,47 @@
#!/usr/bin/env bash
# heavy-prefix.sh — containment helpers for batch/long-running workloads.
#
# Sourcing this file has no side effects beyond defining functions.
#
# Purpose: keep heavy godot/python workloads (RL training, autoplay batches,
# proof renders, claude-player workers) inside the heavy-tests.slice cgroup
# so they cannot starve sshd / interactive work on apricot. Background: on
# 2026-05-18 / 2026-05-19 the box wedged when ~3000 godot workers spawned
# outside any cgroup; CPUWeight=20 on the slice lets sshd preempt.
# Report whether commands can be wrapped with systemd-run on this host.
# Returns: 0 when `uname -s` prints "Linux" and `command -v` can resolve
#   systemd-run; non-zero otherwise.
# Only the presence of the tool is checked, not whether a user manager is
# actually reachable.
_heavy_have_systemd_run() {
  case "$(uname -s)" in
    Linux)
      # Status of the lookup becomes the function's status.
      command -v systemd-run >/dev/null 2>&1
      ;;
    *)
      return 1
      ;;
  esac
}
# Replace the current shell with the given command, wrapped in a transient
# scope under heavy-tests.slice and demoted with `nice -n 10 ionice -c 3`.
# When _heavy_have_systemd_run reports that systemd-run is unavailable, the
# command is exec'd directly instead: no slice, no nice/ionice.
# Args: <unit-name> <cmd> [args...]
# Returns: never returns once the exec succeeds. A missing or empty unit
#   name or command aborts through ${1:?}, which prints the message on
#   stderr and exits a non-interactive shell with a non-zero status.
# NOTE(review): launchers that create their own transient scope (flatpak run
#   is believed to do this) may move the real workload out of
#   heavy-tests.slice; confirm with `systemd-cgls --user` on a live worker.
heavy_exec() {
  local unit="${1:?heavy_exec: unit name required}"
  shift
  # Without this guard, `exec "$@"` with no words is a no-op that returns 0,
  # and the systemd-run path would start a scope with nothing to run.
  : "${1:?heavy_exec: command required}"
  if _heavy_have_systemd_run; then
    exec systemd-run --user \
      --slice=heavy-tests.slice \
      --scope --quiet --collect \
      --unit="${unit}" \
      -- nice -n 10 ionice -c 3 "$@"
  fi
  exec "$@"
}
# Start the given command as a detached transient .service under
# heavy-tests.slice. Returns immediately; logs go to the journal
# (journalctl --user -u <unit>).
# When _heavy_have_systemd_run reports that systemd-run is unavailable, the
# command is backgrounded with nohup instead; in that case the unit name is
# unused and stdout/stderr are discarded to /dev/null.
# Args: <unit-name> <cmd> [args...]
# Returns: systemd-run's exit status on the systemd path; the status of
#   launching the background job on the fallback path. A missing or empty
#   unit name or command aborts through ${1:?}, which prints the message on
#   stderr and exits a non-interactive shell with a non-zero status.
heavy_service() {
  local unit="${1:?heavy_service: unit name required}"
  shift
  # Without this guard the fallback would background a bare `nohup`, whose
  # usage error is sent to /dev/null while this function reports success.
  : "${1:?heavy_service: command required}"
  if _heavy_have_systemd_run; then
    systemd-run --user \
      --slice=heavy-tests.slice \
      --unit="${unit}" \
      --collect --quiet \
      -- "$@"
    return
  fi
  nohup "$@" >/dev/null 2>&1 &
}