#!/bin/bash
# Monitoring script for the pandora machine.
#
# Polls TARGET_URL once per CHECK_INTERVAL. A 502 Bad Gateway response (or no
# HTTP response at all) is taken as a sign that pandora has hung. When the
# failure persists for longer than GRACE_PERIOD (default: 5 minutes) the
# machine is power-cycled through a Home Assistant switch entity.
#
# Every setting below can be overridden from the environment, e.g.
#   HA_TOKEN=... HA_ENTITY=switch.other ./pandora_monitor.sh

# Fail on typos in variable names. `set -e` is deliberately NOT enabled: this
# is a long-running watchdog and a failing probe or log write must not kill it.
set -u

# Configuration
TARGET_URL="${TARGET_URL:-https://vorgabenportal.knowyoursecurity.com}"
HA_URL="${HA_URL:-http://homeassistant.local:8123}"    # Home Assistant URL
HA_TOKEN="${HA_TOKEN:-YOUR_LONG_LIVED_ACCESS_TOKEN}"   # Get from HA: Profile > Long-Lived Access Tokens
HA_ENTITY="${HA_ENTITY:-switch.pandora_power}"         # Entity ID of the smart switch/outlet
LOG_FILE="${LOG_FILE:-/var/log/pandora_monitor.log}"
MAX_FAILURES="${MAX_FAILURES:-2}"          # Minimum consecutive failed checks before a reboot
GRACE_PERIOD="${GRACE_PERIOD:-300}"        # Seconds a failure must persist (grace for deployments)
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"     # Seconds between checks
POWER_OFF_WAIT="${POWER_OFF_WAIT:-10}"     # Seconds to keep the outlet off
POST_REBOOT_WAIT="${POST_REBOOT_WAIT:-120}" # Extra seconds to wait after a reboot attempt

# Runtime state (shared between the main loop and power_cycle)
FAILURE_COUNT=0
FAILURE_START_TIME=0

# Write a timestamped message to stdout and append it to LOG_FILE.
# Arguments: $1 - message text
log_message() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# Probe TARGET_URL and print the HTTP status code on stdout.
# If pandora hangs, the web service in front of it answers 502 Bad Gateway.
# Prints 000 when no HTTP response was received (timeout, DNS/connect error,
# or curl itself could not be run), so the caller never sees an empty code.
check_responsive() {
    local http_code
    # curl exits non-zero on timeouts; that is an expected outcome here and is
    # reported through the 000 code instead of the exit status.
    http_code=$(curl -s -o /dev/null -w "%{http_code}" \
        --max-time 10 \
        --connect-timeout 5 \
        "$TARGET_URL" 2>/dev/null) || true

    printf '%s\n' "${http_code:-000}"
}

# Call a Home Assistant switch service for HA_ENTITY.
# Arguments: $1 - service name (turn_on | turn_off)
# Returns:   0 if Home Assistant answered with a 2xx status, 1 otherwise
_ha_switch_service() {
    local service=$1
    local status
    # The bearer token is passed through a curl config read from stdin so it
    # does not show up in the process list.
    status=$(curl -s -o /dev/null -w "%{http_code}" \
        --max-time 15 \
        --connect-timeout 5 \
        -X POST \
        --config - \
        -H "Content-Type: application/json" \
        -d '{"entity_id":"'"$HA_ENTITY"'"}' \
        "$HA_URL/api/services/switch/$service" 2>/dev/null \
        <<<"header = \"Authorization: Bearer $HA_TOKEN\"") || true

    [[ "$status" == 2?? ]]
}

# Power cycle pandora via Home Assistant: switch the outlet off, wait, switch
# it back on.
# Globals:  FAILURE_COUNT (read, for the log message), HA_ENTITY, POWER_OFF_WAIT
# Returns:  0 if both requests were accepted, 1 if any of them failed
power_cycle() {
    local rc=0
    local attempt
    local -r max_on_attempts=3

    log_message "⚠️ ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..."

    # Turn off the switch
    if _ha_switch_service turn_off; then
        log_message "Power OFF command sent to $HA_ENTITY"
    else
        log_message "ERROR: Power OFF request for $HA_ENTITY was not accepted by Home Assistant"
        rc=1
    fi
    sleep "$POWER_OFF_WAIT" # Wait for the machine to shut down

    # Turn on the switch. This is attempted even if the OFF request reported
    # a failure (it may still have taken effect), and retried, because a lost
    # ON request would leave the machine powered down.
    for ((attempt = 1; attempt <= max_on_attempts; attempt++)); do
        if _ha_switch_service turn_on; then
            log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting"
            return "$rc"
        fi
        log_message "ERROR: Power ON request for $HA_ENTITY failed (attempt $attempt/$max_on_attempts)"
        sleep 5
    done

    log_message "ERROR: Could not switch $HA_ENTITY back on - manual intervention may be required"
    return 1
}

# Main monitoring loop. Never returns under normal operation.
main() {
    local http_code current_time elapsed remaining

    if ! command -v curl >/dev/null 2>&1; then
        log_message "ERROR: curl is required but was not found in PATH"
        exit 1
    fi

    log_message "========================================"
    log_message "Starting pandora monitoring service"
    log_message "Monitoring: $TARGET_URL"
    log_message "HA Entity: $HA_ENTITY"
    log_message "Check interval: ${CHECK_INTERVAL}s"
    log_message "Grace period: ${GRACE_PERIOD}s"
    log_message "========================================"

    if [[ "$HA_TOKEN" == "YOUR_LONG_LIVED_ACCESS_TOKEN" ]]; then
        log_message "WARNING: HA_TOKEN is still the placeholder value - power cycling will fail"
    fi

    while true; do
        http_code=$(check_responsive)
        current_time=$(date +%s)

        # Check if service is OK (not 502 and not timeout)
        if [[ "$http_code" != "502" && "$http_code" != "000" ]]; then
            # Service is healthy
            if ((FAILURE_COUNT > 0)); then
                log_message "✓ Service recovered! (was down $FAILURE_COUNT times)"
            fi
            FAILURE_COUNT=0
            FAILURE_START_TIME=0
            log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
        elif ((FAILURE_COUNT == 0)); then
            # First failure - start grace period
            FAILURE_START_TIME=$current_time
            FAILURE_COUNT=1
            log_message "⚠️ First 502/timeout detected (HTTP $http_code) - Starting ${GRACE_PERIOD}s grace period"
        else
            # Already in failure state
            FAILURE_COUNT=$((FAILURE_COUNT + 1))
            elapsed=$((current_time - FAILURE_START_TIME))
            remaining=$((GRACE_PERIOD - elapsed))

            if ((remaining > 0)); then
                log_message "⚠️ Still seeing 502 (HTTP $http_code) - Grace period: ${remaining}s remaining"
            elif ((FAILURE_COUNT < MAX_FAILURES)); then
                log_message "⚠️ Still seeing 502 (HTTP $http_code) - Grace period over, $FAILURE_COUNT/$MAX_FAILURES consecutive failures"
            else
                # Grace period expired - trigger reboot
                log_message "🔴 CRITICAL: Service unresponsive for >${GRACE_PERIOD}s! Triggering power cycle..."
                power_cycle
                # Reset only after power_cycle, which reports FAILURE_COUNT.
                FAILURE_COUNT=0
                FAILURE_START_TIME=0
                sleep "$POST_REBOOT_WAIT" # Give the machine time to boot before checking again
            fi
        fi

        sleep "$CHECK_INTERVAL"
    done
}

main