perf(player-api): ⚡ Add systemd slice integration to enforce CPU/memory limits for player API workers during RL training
Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
parent
16ca411cd4
commit
0f24c80f1b
2 changed files with 71 additions and 17 deletions
|
|
@ -25,6 +25,12 @@ set -uo pipefail
|
|||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
|
||||
# Wrap godot in heavy-tests.slice on Linux so a fleet of player_api workers
|
||||
# (typical RL training spawns 16-64) cannot starve sshd / interactive work.
|
||||
# See scripts/run/heavy-prefix.sh and ~/.config/systemd/user/heavy-tests.slice.
|
||||
# shellcheck source=run/heavy-prefix.sh
|
||||
source "${SCRIPT_DIR}/run/heavy-prefix.sh"
|
||||
|
||||
# Defaults — adapter overrides via env.
|
||||
: "${CP_SEED:=42}"
|
||||
: "${CP_PLAYERS:=2}"
|
||||
|
|
@ -59,22 +65,23 @@ case "$(uname -s)" in
|
|||
res://engine/scenes/headless/player_api_main.tscn
|
||||
;;
|
||||
*)
|
||||
exec flatpak run --user \
|
||||
--env=CP_SEED="$CP_SEED" \
|
||||
--env=CP_PLAYERS="$CP_PLAYERS" \
|
||||
--env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
|
||||
--env=CP_MAP_SIZE="$CP_MAP_SIZE" \
|
||||
--env=CP_MAP_TYPE="$CP_MAP_TYPE" \
|
||||
--env=CP_OMNISCIENT="$CP_OMNISCIENT" \
|
||||
--env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
|
||||
--env=CP_LOG_FILE="$CP_LOG_FILE" \
|
||||
--env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
|
||||
--env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
|
||||
--env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
|
||||
org.godotengine.Godot \
|
||||
--path "$PROJECT_DIR/src/game" \
|
||||
--headless \
|
||||
--rendering-method gl_compatibility \
|
||||
res://engine/scenes/headless/player_api_main.tscn
|
||||
heavy_exec "player-api-$$" \
|
||||
flatpak run --user \
|
||||
--env=CP_SEED="$CP_SEED" \
|
||||
--env=CP_PLAYERS="$CP_PLAYERS" \
|
||||
--env=CP_PLAYER_SLOT="$CP_PLAYER_SLOT" \
|
||||
--env=CP_MAP_SIZE="$CP_MAP_SIZE" \
|
||||
--env=CP_MAP_TYPE="$CP_MAP_TYPE" \
|
||||
--env=CP_OMNISCIENT="$CP_OMNISCIENT" \
|
||||
--env=CP_TIMEOUT_SEC="$CP_TIMEOUT_SEC" \
|
||||
--env=CP_LOG_FILE="$CP_LOG_FILE" \
|
||||
--env=CP_VICTORY_MODE="${CP_VICTORY_MODE:-}" \
|
||||
--env=CP_PLAYER_CONTROLLERS="${CP_PLAYER_CONTROLLERS:-}" \
|
||||
--env=CP_PLAYER_SLOTS="${CP_PLAYER_SLOTS:-}" \
|
||||
org.godotengine.Godot \
|
||||
--path "$PROJECT_DIR/src/game" \
|
||||
--headless \
|
||||
--rendering-method gl_compatibility \
|
||||
res://engine/scenes/headless/player_api_main.tscn
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
47
scripts/run/heavy-prefix.sh
Normal file
47
scripts/run/heavy-prefix.sh
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/env bash
|
||||
# heavy-prefix.sh — containment helpers for batch/long-running workloads.
|
||||
#
|
||||
# Sourcing this file has no side effects beyond defining functions.
|
||||
#
|
||||
# Purpose: keep heavy godot/python workloads (RL training, autoplay batches,
|
||||
# proof renders, claude-player workers) inside the heavy-tests.slice cgroup
|
||||
# so they cannot starve sshd / interactive work on apricot. Background: on
|
||||
# 2026-05-18 / 2026-05-19 the box wedged when ~3000 godot workers spawned
|
||||
# outside any cgroup; CPUWeight=20 on the slice lets sshd preempt.
|
||||
|
||||
# Report whether systemd-run can be used for containment on this host.
# Succeeds only when `uname -s` prints "Linux" and a `systemd-run` command is
# resolvable by the current shell. Prints nothing; the exit status is the answer.
# Note: this does not check that a systemd user manager is actually reachable.
# Args: none
_heavy_have_systemd_run() {
  [[ "$(uname -s)" == "Linux" ]] || return 1
  command -v systemd-run >/dev/null 2>&1
}
|
||||
|
||||
# Replace the current shell with the given command, wrapped in a transient
# scope under heavy-tests.slice and started through `nice -n 10 ionice -c 3`.
# When systemd-run is unavailable (non-Linux, or the binary is not installed)
# the command is exec'd directly, with no slice and no priority change.
# Args: <unit-name> <cmd> [args...]
# Returns: does not return on success. A missing unit name or command aborts
#   through the ${1:?} checks before anything is exec'd.
# NOTE(review): a command that moves itself into its own transient scope after
# start-up would leave this slice — flatpak run may do this; confirm with
# systemd-cgls on a running worker.
heavy_exec() {
  local unit="${1:?heavy_exec: unit name required}"
  shift
  # A bare `exec` with no command is a silent no-op, which would let the caller
  # carry on as if the workload had been launched; reject it explicitly.
  : "${1:?heavy_exec: command required}"
  if _heavy_have_systemd_run; then
    exec systemd-run --user \
      --slice=heavy-tests.slice \
      --scope --quiet --collect \
      --unit="${unit}" \
      -- nice -n 10 ionice -c 3 "$@"
  fi
  exec "$@"
}
|
||||
|
||||
# Start the given command as a detached transient .service under
# heavy-tests.slice. Returns immediately; logs go to the journal
# (journalctl --user -u <unit>).
# When systemd-run is unavailable (non-Linux, or the binary is not installed)
# the command is started in the background with nohup and its stdout/stderr
# are discarded.
# Args: <unit-name> <cmd> [args...]
# Returns: the exit status of systemd-run on the systemd path; 0 on the nohup
#   path once the job has been backgrounded. A missing unit name or command
#   aborts through the ${1:?} checks.
# NOTE(review): unlike heavy_exec, no nice/ionice prefix is applied here —
# confirm that relying on the slice settings alone is intended.
heavy_service() {
  local unit="${1:?heavy_service: unit name required}"
  shift
  # Without a command the nohup path would background a failing `nohup` and
  # still report success; reject the call up front.
  : "${1:?heavy_service: command required}"
  if _heavy_have_systemd_run; then
    systemd-run --user \
      --slice=heavy-tests.slice \
      --unit="${unit}" \
      --collect --quiet \
      -- "$@"
    return $?
  fi
  nohup "$@" >/dev/null 2>&1 &
}
|
||||
Loading…
Add table
Reference in a new issue