#!/bin/bash
# Thinkcentre-watchdog/pandora_monitor.sh
# Monitoring script for pandora machine
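# Checks whether vorgabenportal.knowyoursecurity.com returns 502 Bad Gateway or
# gives no HTTP response at all (curl reports code 000); both are treated as a
# sign that pandora has hung.
# If the failure persists for more than 5 minutes, power-cycles the machine via Home Assistant.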

# Configuration
# Every setting can be overridden from the environment (for example through a
# systemd EnvironmentFile); the values written here are only the defaults.
TARGET_URL="${TARGET_URL:-https://vorgabenportal.knowyoursecurity.com}"
HA_URL="${HA_URL:-http://homeassistant.local:8123}"  # Home Assistant URL
# Long-lived access token (get from HA: Profile > Long-Lived Access Tokens).
# Prefer supplying it via the environment instead of editing it into this file.
HA_TOKEN="${HA_TOKEN:-YOUR_LONG_LIVED_ACCESS_TOKEN}"
HA_ENTITY="${HA_ENTITY:-switch.pandora_power}"  # Entity ID of the smart switch/outlet
LOG_FILE="${LOG_FILE:-/var/log/pandora_monitor.log}"
# NOTE: MAX_FAILURES is kept for compatibility but is not read by the
# monitoring loop; the reboot decision is driven by GRACE_PERIOD.
MAX_FAILURES="${MAX_FAILURES:-2}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"    # Seconds a failure must persist before power-cycling (5 minutes, covers deployments)
CHECK_INTERVAL="${CHECK_INTERVAL:-60}" # Seconds between probes

# Runtime state (always starts at zero; deliberately not overridable)
FAILURE_COUNT=0        # Consecutive failed probes
FAILURE_START_TIME=0   # Epoch seconds of the first failed probe in the current streak

# Emit one timestamped log line.
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout, and the same line
#            appended to LOG_FILE
log_message() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

# Probe TARGET_URL and print the resulting HTTP status code on stdout.
# If pandora hangs, the web service behind it returns 502 Bad Gateway.
# The function does not judge the code itself; the caller evaluates it.
# Globals:   TARGET_URL (read)
# Outputs:   three-digit HTTP status code; "000" when no HTTP response was
#            obtained
check_responsive() {
    local http_code
    # stderr is discarded on purpose: failures are conveyed through the code.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        --max-time 10 \
        --connect-timeout 5 \
        "$TARGET_URL" 2>/dev/null)

    # curl itself writes 000 when it received no response. Use the same
    # sentinel when curl printed nothing at all (e.g. it is not installed),
    # so the caller never mistakes an empty string for a healthy status.
    printf '%s\n' "${http_code:-000}"
}

# Call a Home Assistant switch service for HA_ENTITY.
# Globals:   HA_URL, HA_TOKEN, HA_ENTITY (read)
# Arguments: $1 - service name under the switch domain (turn_off or turn_on)
# Returns:   curl's exit status; non-zero on transport errors, timeouts, or an
#            HTTP error status (because of --fail)
_ha_switch_service() {
    local service=$1
    local payload
    printf -v payload '{"entity_id":"%s"}' "$HA_ENTITY"

    # Timeouts keep a hung Home Assistant from stalling the watchdog forever.
    curl -s --fail \
        --connect-timeout 5 \
        --max-time 15 \
        -X POST \
        -H "Authorization: Bearer $HA_TOKEN" \
        -H "Content-Type: application/json" \
        -d "$payload" \
        "$HA_URL/api/services/switch/$service" &>/dev/null
}

# Function to power cycle via Home Assistant: switch HA_ENTITY off, wait 10
# seconds, then switch it back on.
# Globals:   FAILURE_COUNT, HA_ENTITY (read)
# Returns:   0 when both requests succeeded, 1 otherwise
power_cycle() {
    local status=0
    local attempt
    local -r max_on_attempts=3

    log_message "⚠️  ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..."

    # Turn off the switch
    if _ha_switch_service turn_off; then
        log_message "Power OFF command sent to $HA_ENTITY"
    else
        log_message "ERROR: Power OFF request for $HA_ENTITY failed (Home Assistant unreachable or call rejected)"
        status=1
    fi
    sleep 10  # Wait 10 seconds for machine to shut down

    # Turn on the switch. This is attempted even if the OFF request reported
    # an error (the request may still have taken effect), and retried because
    # a lost ON request would leave the machine without power.
    for ((attempt = 1; attempt <= max_on_attempts; attempt++)); do
        if _ha_switch_service turn_on; then
            log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting"
            return "$status"
        fi
        log_message "ERROR: Power ON request for $HA_ENTITY failed (attempt $attempt/$max_on_attempts)"
        if ((attempt < max_on_attempts)); then
            sleep 5
        fi
    done

    return 1
}

# Main monitoring loop
# Probes TARGET_URL every CHECK_INTERVAL seconds. The first failed probe starts
# a grace period of GRACE_PERIOD seconds; if probes are still failing once it
# has elapsed, the machine is power-cycled and the failure state is reset.
log_message "========================================"
log_message "Starting pandora monitoring service"
log_message "Monitoring: $TARGET_URL"
log_message "HA Entity: $HA_ENTITY"
log_message "Check interval: ${CHECK_INTERVAL}s"
log_message "Grace period: ${GRACE_PERIOD}s"
log_message "========================================"

# Whole minutes of grace period, so log text follows GRACE_PERIOD instead of a
# hard-coded "5".
grace_minutes=$((GRACE_PERIOD / 60))

while true; do
    http_code=$(check_responsive)
    current_time=$(date +%s)

    case "$http_code" in
        502 | 000 | "")
            # Service returned 502, timed out, or the probe produced no code
            # at all (an empty result must not be mistaken for "healthy").
            if ((FAILURE_COUNT == 0)); then
                # First failure - start grace period
                FAILURE_START_TIME=$current_time
                FAILURE_COUNT=1
                log_message "⚠️  First 502/timeout detected (HTTP ${http_code:-none}) - Starting ${grace_minutes} minute grace period"
            else
                # Already in failure state
                FAILURE_COUNT=$((FAILURE_COUNT + 1))
                elapsed=$((current_time - FAILURE_START_TIME))
                remaining=$((GRACE_PERIOD - elapsed))

                if ((remaining > 0)); then
                    log_message "⚠️  Still seeing 502 (HTTP ${http_code:-none}) - Grace period: ${remaining}s remaining"
                else
                    # Grace period expired - trigger reboot
                    log_message "🔴 CRITICAL: Service unresponsive for >${grace_minutes} minutes! Triggering power cycle..."
                    power_cycle
                    FAILURE_COUNT=0
                    FAILURE_START_TIME=0
                    sleep 120  # Wait 2 minutes after reboot attempt before checking again
                fi
            fi
            ;;
        *)
            # Any other status code counts as healthy
            if ((FAILURE_COUNT > 0)); then
                log_message "✓ Service recovered! (was down $FAILURE_COUNT times)"
            fi
            FAILURE_COUNT=0
            FAILURE_START_TIME=0
            log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
            ;;
    esac

    sleep "$CHECK_INTERVAL"
done