fix(scripts): 🐛 Fix false positives in container liveness checks and batch status reporting logic

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-05-27 11:40:48 -07:00
parent 619dafbdcc
commit 2d7357550e

View file

@@ -334,25 +334,32 @@ if [[ "${MODE}" == "status" ]]; then
# Single ssh probe with short ConnectTimeout. Four lightweight queries:
# 1. systemctl --user is-active <unit> (active|inactive|failed|unknown)
# `|| true` because is-active exits non-zero on inactive/failed/unknown
# units and we want the stdout string, not the exit code. The previous
# `|| echo unknown` appended a second line to the captured value, which
# broke the single-line `|`-delimited printf contract → `read -r` consumed
# only the first line and left all remaining fields empty →
# false-positive `failed` after a clean batch.
# 2. count of completion.marker files under <stamp>/*/
# 3. count of turn_stats.jsonl files under <stamp>/*/game_*/
# 4. count of live godot processes for THIS batch stamp
# 4. container liveness for THIS batch stamp.
# The pre-docker-wrapper probe counted live godot procs via pgrep, but
# (a) the launch stamp is never in the godot cmdline (the
# autoplay-batch.sh runtime stamp is, and the two are distinct), and
# (b) `pgrep -af "godot.*${STAMP}" | grep "godot --path"` self-matched
# the ssh probe itself (whose command line contains both substrings as
# literals).
# Under the docker wrapper the launcher.sh blocks on `docker run --rm`
# and the trap docker-kills the container on exit, so container
# existence is the correct liveness signal.
# We also read seeds_total from the first submode dir if present.
#
# The godot-proc count is load-bearing: `flatpak run` detaches into a
# systemd user scope, so autoplay-batch.sh's `wait` returns and
# completion.marker is touched while the actual godot processes are still
# running headless games. Without checking live procs, fetch would pull
# mid-run turn_stats with outcome=in_progress and the consumer would
# think the gate failed when in fact games hadn't finished yet.
PROBE='set +e
IS_ACTIVE=$(systemctl --user is-active '"${UNIT}"' 2>/dev/null || echo unknown)
IS_ACTIVE=$(systemctl --user is-active '"${UNIT}"' 2>/dev/null || true)
IS_ACTIVE=${IS_ACTIVE:-unknown}
MARKER_COUNT=$(ls "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/completion.marker 2>/dev/null | wc -l | tr -d " ")
STATS_COUNT=$(ls "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/game_*/turn_stats.jsonl 2>/dev/null | wc -l | tr -d " ")
GODOT_PROCS=$(pgrep -af "godot.*'"${QUERY_STAMP}"'" 2>/dev/null | grep -c "godot --path" || echo 0)
CONTAINER_LIVE=$(docker ps --filter "name=mc-batch-'"${QUERY_STAMP}"'-run" --quiet 2>/dev/null | wc -l | tr -d " ")
SEEDS_TOTAL=$(cat "$HOME/.cache/mc-batches/'"${QUERY_STAMP}"'"/*/seeds_total 2>/dev/null | head -1)
SEEDS_TOTAL=${SEEDS_TOTAL:-0}
printf "%s|%s|%s|%s|%s\n" "$IS_ACTIVE" "$MARKER_COUNT" "$STATS_COUNT" "$SEEDS_TOTAL" "$GODOT_PROCS"'
printf "%s|%s|%s|%s|%s\n" "$IS_ACTIVE" "$MARKER_COUNT" "$STATS_COUNT" "$SEEDS_TOTAL" "$CONTAINER_LIVE"'
PROBE_OUT="$(ssh -o ConnectTimeout=5 -o BatchMode=yes "${APRICOT}" "${PROBE}" 2>/dev/null)" || PROBE_OUT=""
@@ -361,21 +368,21 @@ if [[ "${MODE}" == "status" ]]; then
exit 0
fi
IFS='|' read -r IS_ACTIVE MARKER_COUNT STATS_COUNT SEEDS_TOTAL GODOT_PROCS <<<"${PROBE_OUT}"
IFS='|' read -r IS_ACTIVE MARKER_COUNT STATS_COUNT SEEDS_TOTAL CONTAINER_LIVE <<<"${PROBE_OUT}"
MARKER_COUNT="${MARKER_COUNT:-0}"
STATS_COUNT="${STATS_COUNT:-0}"
SEEDS_TOTAL="${SEEDS_TOTAL:-0}"
GODOT_PROCS="${GODOT_PROCS:-0}"
CONTAINER_LIVE="${CONTAINER_LIVE:-0}"
if [[ "${MARKER_COUNT}" -gt 0 && "${GODOT_PROCS}" -eq 0 ]]; then
if [[ "${MARKER_COUNT}" -gt 0 && "${CONTAINER_LIVE}" -eq 0 ]]; then
STATE="complete"
MARKER_BOOL="true"
elif [[ "${MARKER_COUNT}" -gt 0 && "${GODOT_PROCS}" -gt 0 ]]; then
# Batch script returned (touched completion.marker) but flatpak-
# detached godot processes are still playing games. Status remains
# `running` so `fetch` won't pull mid-run turn_stats snapshots.
elif [[ "${CONTAINER_LIVE}" -gt 0 ]]; then
# Container still running — batch is in-flight even if the launcher
# has somehow touched a stale marker (it shouldn't, the touch is
# post-batch, but be defensive).
STATE="running"
MARKER_BOOL="true"
MARKER_BOOL=$([[ "${MARKER_COUNT}" -gt 0 ]] && echo "true" || echo "false")
elif [[ "${IS_ACTIVE}" == "active" || "${IS_ACTIVE}" == "activating" ]]; then
STATE="running"
MARKER_BOOL="false"
@@ -384,8 +391,8 @@ if [[ "${MODE}" == "status" ]]; then
MARKER_BOOL="false"
fi
printf '{"stamp":"%s","state":"%s","seeds_done":%s,"seeds_total":%s,"completion_marker":%s,"godot_procs":%s}\n' \
"${QUERY_STAMP}" "${STATE}" "${STATS_COUNT}" "${SEEDS_TOTAL}" "${MARKER_BOOL}" "${GODOT_PROCS}"
printf '{"stamp":"%s","state":"%s","seeds_done":%s,"seeds_total":%s,"completion_marker":%s,"container_live":%s}\n' \
"${QUERY_STAMP}" "${STATE}" "${STATS_COUNT}" "${SEEDS_TOTAL}" "${MARKER_BOOL}" "${CONTAINER_LIVE}"
exit 0
fi