#!/usr/bin/env bash
# watchdog.sh - APEX v16 watchdog + daily maintenance
# Esegui in screen 'apex_watchdog' (vedi alias 'watchdog' in ~/.bashrc)

set -u

APEX_DIR="$HOME/apex_v16"
ENV_FILE="$APEX_DIR/.env"

# --- Resolve the account from .env (default: ineligible) ---
# The last ACCOUNT=... line wins. Single and double quotes are stripped, and the
# final `xargs` (which echoes its input) trims the surrounding whitespace.
ACCOUNT="ineligible"
if [[ -f "$ENV_FILE" ]]; then
    env_account=$(grep -E '^ACCOUNT=' "$ENV_FILE" | tail -n1 | cut -d= -f2- | tr -d '"' | tr -d "'" | xargs)
    if [[ -n "${env_account:-}" ]]; then
        ACCOUNT="$env_account"
    fi
fi

# Per-account paths.
LOG_DIR="$APEX_DIR/logs/live_${ACCOUNT}"
SYSTEM_LOG="$LOG_DIR/system.log"
WATCHDOG_LOG="$LOG_DIR/watchdog.log"
LAST_CLEANUP_MARK="$LOG_DIR/.watchdog_last_cleanup"
STATE_FILE="$APEX_DIR/state/state_live_${ACCOUNT}.json"

# Anti-spam flag for the halt log entry: set to 1 after the first detection and
# reset once the bot writes fresh log lines again (halted -> running transition).
HALT_LOGGED=0

STALE_SECONDS=600        # more than 10 min without a write to system.log -> restart
CHECK_INTERVAL=60        # watchdog polling period, in seconds
CLEANUP_INTERVAL=86400   # 24h between cleanup runs
COMPRESS_AGE_DAYS=30     # rotated logs older than 30 days -> .gz

MAIN_SCREEN="apex_main"
MAIN_PATTERN="main.py --mode live --account ${ACCOUNT}"
# MAIN_CMD is executed through `bash -c` by restart_main, so every interpolated
# value is shell-quoted with %q: a directory containing spaces still resolves and
# an unusual ACCOUNT value from .env cannot be interpreted as shell syntax.
# For ordinary values the resulting string is identical to plain interpolation.
printf -v MAIN_CMD \
    'cd %q && source venv/bin/activate && python3 main.py --mode live --account %q --env-file .env' \
    "$APEX_DIR" "$ACCOUNT"

mkdir -p "$LOG_DIR"
touch "$WATCHDOG_LOG"

log_event() {
    # Append one timestamped line to the watchdog log.
    # Usage:  log_event <level> <message>
    # Format: [YYYY-MM-DD HH:MM:SS] [<level>] <message>
    local level="$1" message="$2"
    printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$level" "$message" >> "$WATCHDOG_LOG"
}

is_bot_halted() {
    # Succeeds (status 0) only when the state file exists and its top-level
    # "halted" field is the JSON value true; every other case yields status 1.
    # Read or parse errors also yield 1, so an unreadable state file never
    # blocks a restart.
    if [[ ! -f "$STATE_FILE" ]]; then
        return 1
    fi
    python3 - "$STATE_FILE" 2>/dev/null <<'PY'
import json
import sys

halted = False
try:
    with open(sys.argv[1]) as fh:
        # Identity check: only a real boolean true counts, not "true" or 1.
        halted = json.load(fh).get("halted") is True
except Exception:
    halted = False
sys.exit(0 if halted else 1)
PY
}

get_halt_reason() {
    # Print the "halt_reason" field of the state file on stdout.
    # Prints an empty line when the file is missing, cannot be read or parsed,
    # or when the field is absent or falsy.
    if [[ ! -f "$STATE_FILE" ]]; then
        echo ""
        return
    fi
    python3 - "$STATE_FILE" 2>/dev/null <<'PY'
import json
import sys

try:
    with open(sys.argv[1]) as fh:
        state = json.load(fh)
    reason = state.get("halt_reason")
    print(reason if reason else "")
except Exception:
    print("")
PY
}

log_halt_once() {
    # Write a single HALT entry per halt episode. HALT_LOGGED acts as the latch:
    # nothing is logged while it is non-zero, and it is set to 1 after logging.
    if (( HALT_LOGGED != 0 )); then
        return 0
    fi

    local why
    local msg="bot halted (daily target/stop)"
    why=$(get_halt_reason)
    # Append the reason only when the state file provided one.
    if [[ -n "$why" ]]; then
        msg+=" — reason: $why"
    fi
    log_event "HALT" "$msg"
    HALT_LOGGED=1
}

restart_main() {
    # Stop every process matching MAIN_PATTERN and relaunch main.py inside a
    # detached screen session.
    #
    # Arguments: $1 - human-readable reason, written to the watchdog log.
    # Globals:   MAIN_PATTERN, MAIN_SCREEN, MAIN_CMD, ACCOUNT (read).
    # Returns:   0 when screen accepted the new session; otherwise the exit
    #            status of the failed screen invocation. A 0 only means the
    #            session was created, not that main.py itself started cleanly.
    local reason="$1"
    local rc=0
    log_event "RESTART" "$reason"

    # SIGTERM first, wait 2s, then SIGKILL whatever still matches the pattern.
    pkill -f "$MAIN_PATTERN" 2>/dev/null
    sleep 2
    pkill -9 -f "$MAIN_PATTERN" 2>/dev/null
    sleep 1

    # Clear out dead screen sessions before creating the new one.
    screen -wipe >/dev/null 2>&1
    screen -dmS "$MAIN_SCREEN" bash -c "$MAIN_CMD" || rc=$?
    if (( rc != 0 )); then
        # Do not claim a respawn that did not happen (screen missing, socket
        # directory unusable, ...). The caller's next poll will try again.
        log_event "ERROR" "failed to start screen '$MAIN_SCREEN' (exit=$rc, account=$ACCOUNT)"
        return "$rc"
    fi
    log_event "RESTART" "main.py respawned in screen '$MAIN_SCREEN' (account=$ACCOUNT)"
}

check_liveness() {
    # One liveness probe, based on the modification time of SYSTEM_LOG.
    #   - log present and written within STALE_SECONDS: bot is alive; the halt
    #     latch is cleared so the next halt episode gets logged again.
    #   - log missing or stale while the state file says halted: log the halt
    #     (once) and leave the bot alone.
    #   - log missing or stale otherwise: restart main.py with a reason that
    #     tells the cases apart.
    local stale_age=""

    if [[ -f "$SYSTEM_LOG" ]]; then
        local mtime age
        # A failing stat counts as mtime 0, i.e. as a very old log.
        mtime=$(stat -c %Y "$SYSTEM_LOG" 2>/dev/null || echo 0)
        age=$(( $(date +%s) - mtime ))
        if (( age <= STALE_SECONDS )); then
            HALT_LOGGED=0
            return
        fi
        stale_age="$age"
    fi

    # From here on the log is either missing (stale_age empty) or stale.
    if is_bot_halted; then
        log_halt_once
        return
    fi

    if [[ -z "$stale_age" ]]; then
        restart_main "system.log assente ($SYSTEM_LOG)"
    elif pgrep -f "$MAIN_PATTERN" >/dev/null; then
        restart_main "system.log fermo da ${stale_age}s (>${STALE_SECONDS}s)"
    else
        restart_main "main.py non in esecuzione (system.log age=${stale_age}s)"
    fi
}

daily_cleanup() {
    # Daily maintenance pass: removes Python bytecode caches and gzips old
    # rotated logs, then records when it ran.
    #
    # Globals read: APEX_DIR, COMPRESS_AGE_DAYS, LAST_CLEANUP_MARK.
    # Side effects: deletes __pycache__ directories under APEX_DIR (venv/ is
    #               excluded), gzips matching rotated logs under APEX_DIR/logs,
    #               appends CLEANUP entries through log_event, and writes the
    #               current epoch seconds to LAST_CLEANUP_MARK.
    log_event "CLEANUP" "start"

    # 1) __pycache__ anywhere in APEX_DIR, excluding venv/.
    #    A single traversal both deletes and counts, so the logged number is the
    #    number of directories actually removed. The second -prune stops find
    #    from descending into a directory that is about to disappear, and the
    #    NUL delimiter keeps unusual path names intact.
    local pycache_count=0 cache_dir
    while IFS= read -r -d '' cache_dir; do
        if rm -rf -- "$cache_dir" 2>/dev/null; then
            pycache_count=$((pycache_count + 1))
        fi
    done < <(find "$APEX_DIR" -path "$APEX_DIR/venv" -prune -o \
                  -type d -name "__pycache__" -prune -print0 2>/dev/null)
    log_event "CLEANUP" "__pycache__ rimossi: $pycache_count dir"

    # 2) Compress rotated logs older than COMPRESS_AGE_DAYS in every subfolder
    #    of logs/.
    #    Targets: system.log.YYYY-MM-DD, brain_log.jsonl.YYYY-MM-DD (not .gz).
    #    Current logs (system.log, brain_log.jsonl) do not match the name
    #    patterns, and state/ is outside the searched tree.
    local compressed=0 rotated
    while IFS= read -r -d '' rotated; do
        # stdin is detached so gzip can never consume the path list.
        if gzip -9 -- "$rotated" 2>/dev/null </dev/null; then
            compressed=$((compressed + 1))
        fi
    done < <(find "$APEX_DIR/logs" -type f \
                  \( -name "system.log.*" -o -name "brain_log.jsonl.*" \) \
                  ! -name "*.gz" \
                  -mtime "+${COMPRESS_AGE_DAYS}" -print0 2>/dev/null)
    log_event "CLEANUP" "log compressi (.gz): $compressed file"

    log_event "CLEANUP" "done"
    date +%s > "$LAST_CLEANUP_MARK"
}

needs_cleanup() {
    # Succeeds (status 0) when a cleanup run is due: the marker file is missing,
    # its content is not a plain epoch timestamp, or at least CLEANUP_INTERVAL
    # seconds have passed since the timestamp it holds.
    #
    # Globals read: LAST_CLEANUP_MARK, CLEANUP_INTERVAL.
    if [[ ! -f "$LAST_CLEANUP_MARK" ]]; then
        return 0
    fi
    local last now
    last=$(cat "$LAST_CLEANUP_MARK" 2>/dev/null || echo 0)
    # An empty or corrupt marker (e.g. a truncated write) would otherwise make
    # the arithmetic below a syntax error, so cleanup would never be due again;
    # a non-numeric token is fatal under `set -u`. Treat it as "never cleaned".
    if [[ ! "$last" =~ ^[0-9]+$ ]]; then
        last=0
    fi
    now=$(date +%s)
    # 10# forces base 10 so a value with leading zeros is not parsed as octal.
    (( now - 10#$last >= CLEANUP_INTERVAL ))
}

log_event "BOOT" "watchdog avviato (PID=$$, account=$ACCOUNT, stale=${STALE_SECONDS}s)"

# Supervision loop: probe the bot, run the daily maintenance when it is due,
# then sleep for CHECK_INTERVAL seconds. The loop has no exit condition.
while :; do
    check_liveness
    needs_cleanup && daily_cleanup
    sleep "$CHECK_INTERVAL"
done
