perf(player-api): ⚡ Add systemd slice integration to enforce CPU/memory limits for player API workers during RL training
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
16ca411cd4
commit
0f24c80f1b
2 changed files with 71 additions and 17 deletions
|
|
@ -25,6 +25,12 @@ set -uo pipefail
|
||||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||||
|
|
||||||
|
# Wrap godot in heavy-tests.slice on Linux so a fleet of player_api workers
|
||||||
|
# (typical RL training spawns 16-64) cannot starve sshd / interactive work.
|
||||||
|
# See scripts/run/heavy-prefix.sh and ~/.config/systemd/user/heavy-tests.slice.
|
||||||
|
# shellcheck source=run/heavy-prefix.sh
|
||||||
|
source "${SCRIPT_DIR}/run/heavy-prefix.sh"
|
||||||
|
|
||||||
# Defaults — adapter overrides via env.
|
# Defaults — adapter overrides via env.
|
||||||
: "${CP_SEED:=42}"
|
: "${CP_SEED:=42}"
|
||||||
: "${CP_PLAYERS:=2}"
|
: "${CP_PLAYERS:=2}"
|
||||||
|
|
@ -59,22 +65,23 @@ case "$(uname -s)" in
|
||||||
res://engine/scenes/headless/player_api_main.tscn
|
res://engine/scenes/headless/player_api_main.tscn
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
exec flatpak run --user \
|
heavy_exec "player-api-$$" \
|
||||||
--env=CP_SEED="$CP_SEED" \
|
flatpak run --user \
|
||||||
--env=CP_PLAYERS="$CP_PLAYERS" \
|
--env=CP_SEED="$CP_SEED" \
|
||||||
--env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
|
--env=CP_PLAYERS="$CP_PLAYERS" \
|
||||||
--env=CP_MAP_SIZE="$CP_MAP_SIZE" \
|
--env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
|
||||||
--env=CP_MAP_TYPE="$CP_MAP_TYPE" \
|
--env=CP_MAP_SIZE="$CP_MAP_SIZE" \
|
||||||
--env=CP_OMNISCIENT="$CP_OMNISCIENT" \
|
--env=CP_MAP_TYPE="$CP_MAP_TYPE" \
|
||||||
--env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
|
--env=CP_OMNISCIENT="$CP_OMNISCIENT" \
|
||||||
--env=CP_LOG_FILE="$CP_LOG_FILE" \
|
--env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
|
||||||
--env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
|
--env=CP_LOG_FILE="$CP_LOG_FILE" \
|
||||||
--env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
|
--env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
|
||||||
--env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
|
--env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
|
||||||
org.godotengine.Godot \
|
--env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
|
||||||
--path "$PROJECT_DIR/src/game" \
|
org.godotengine.Godot \
|
||||||
--headless \
|
--path "$PROJECT_DIR/src/game" \
|
||||||
--rendering-method gl_compatibility \
|
--headless \
|
||||||
res://engine/scenes/headless/player_api_main.tscn
|
--rendering-method gl_compatibility \
|
||||||
|
res://engine/scenes/headless/player_api_main.tscn
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
|
||||||
47
scripts/run/heavy-prefix.sh
Normal file
47
scripts/run/heavy-prefix.sh
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
# heavy-prefix.sh — containment helpers for batch/long-running workloads.
|
||||||
|
#
|
||||||
|
# Sourcing this file has no side effects beyond defining functions.
|
||||||
|
#
|
||||||
|
# Purpose: keep heavy godot/python workloads (RL training, autoplay batches,
|
||||||
|
# proof renders, claude-player workers) inside the heavy-tests.slice cgroup
|
||||||
|
# so they cannot starve sshd / interactive work on apricot. Background: on
|
||||||
|
# 2026-05-18 / 2026-05-19 the box wedged when ~3000 godot workers spawned
|
||||||
|
# outside any cgroup; CPUWeight=20 on the slice lets sshd preempt.
|
||||||
|
|
||||||
|
# Succeeds (exit 0) only when `uname -s` reports Linux and a `systemd-run`
# command can be resolved by the shell; fails otherwise. Prints nothing.
_heavy_have_systemd_run() {
  local kernel
  kernel="$(uname -s)"
  if [[ "${kernel}" != "Linux" ]]; then
    return 1
  fi
  # The status of the lookup is the function's result.
  command -v systemd-run >/dev/null 2>&1
}
|
||||||
|
|
||||||
|
# Replace the current shell with the given command, wrapped in a transient
# scope under heavy-tests.slice and run via `nice -n 10 ionice -c 3`
# (lowered CPU priority, idle I/O class).
# When _heavy_have_systemd_run fails (non-Linux, or no systemd-run command
# available) the command is exec'd directly with no containment or
# priority adjustment.
# Args: <unit-name> <cmd> [args...]
# Returns: does not return on success. A missing unit name or command is
#   rejected through ${1:?}, which prints the message to stderr and
#   terminates a non-interactive shell.
heavy_exec() {
  local unit="${1:?heavy_exec: unit name required}"
  shift
  # A bare `exec` with no command is a silent no-op returning 0, which would
  # let the caller run on past what was meant to be its final statement.
  # Reject the call instead.
  : "${1:?heavy_exec: command required}"
  if _heavy_have_systemd_run; then
    exec systemd-run --user \
      --slice=heavy-tests.slice \
      --scope --quiet --collect \
      --unit="${unit}" \
      -- nice -n 10 ionice -c 3 "$@"
  fi
  exec "$@"
}
|
||||||
|
|
||||||
|
# Start the given command detached and return immediately.
# With systemd-run available, the command runs as a transient .service
# under heavy-tests.slice; logs go to the journal
# (journalctl --user -u <unit>) and systemd-run's exit status is returned.
# Otherwise the command is backgrounded with nohup, its stdout/stderr are
# discarded, the unit name is unused, and the return status is 0.
# NOTE(review): a transient service is started by the user manager, so it
# presumably does not see the caller's exported variables or working
# directory, unlike the nohup fallback. Confirm callers do not rely on them.
# Args: <unit-name> <cmd> [args...]
# Returns: see above. A missing unit name or command is rejected through
#   ${1:?}, which prints the message to stderr and terminates a
#   non-interactive shell.
heavy_service() {
  local unit="${1:?heavy_service: unit name required}"
  shift
  # Without this guard the fallback would background a bare `nohup`, whose
  # usage error is sent to /dev/null, and still report success.
  : "${1:?heavy_service: command required}"
  if _heavy_have_systemd_run; then
    systemd-run --user \
      --slice=heavy-tests.slice \
      --unit="${unit}" \
      --collect --quiet \
      -- "$@"
    return $?
  fi
  nohup "$@" >/dev/null 2>&1 &
}
|
||||||
Loading…
Add table
Reference in a new issue