added monitoring script for Pandora

2025-12-05 21:59:09 +01:00
parent 058eeb5246
commit a3b13c5fe6
1 changed files with 108 additions and 0 deletions
--- a/pandora_monitor.sh
+++ b/pandora_monitor.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Monitoring script for pandora machine
+# Checks if vorgabenportal.knowyoursecurity.com returns 502 Bad Gateway (indicates pandora hung)
+# If 502 detected for >5 minutes, power-cycles the machine via Home Assistant
+
+# Configuration
+# The string settings below may be overridden from the environment; the value
+# after ":-" is the default used when the variable is unset or empty. Passing
+# HA_TOKEN this way keeps the real secret out of the script file.
+TARGET_URL="${TARGET_URL:-https://vorgabenportal.knowyoursecurity.com}"  # URL probed on every check
+HA_URL="${HA_URL:-http://homeassistant.local:8123}"   # Home Assistant URL
+HA_TOKEN="${HA_TOKEN:-YOUR_LONG_LIVED_ACCESS_TOKEN}"  # Get from HA: Profile > Long-Lived Access Tokens
+HA_ENTITY="${HA_ENTITY:-switch.pandora_power}"        # Entity ID of the smart switch/outlet
+LOG_FILE="${LOG_FILE:-/var/log/pandora_monitor.log}"  # File that log_message appends to
+MAX_FAILURES=2  # NOTE(review): never read by this script; the reboot decision is time-based (GRACE_PERIOD)
+GRACE_PERIOD=300  # 5 minutes grace for deployments
+CHECK_INTERVAL=60  # Check every 1 minute (more granular during grace period)
+
+# Runtime state, updated by the main loop
+FAILURE_COUNT=0        # Number of consecutive failed checks
+FAILURE_START_TIME=0   # Epoch seconds of the first failure in the current streak (0 = none)
+
+# log_message MESSAGE
+# Prefixes MESSAGE with a "[YYYY-MM-DD HH:MM:SS]" timestamp, prints the line to
+# stdout and appends the same line to $LOG_FILE.
+log_message() {
+    local stamp
+    stamp=$(date '+%Y-%m-%d %H:%M:%S')
+    printf '%s\n' "[$stamp] $1" | tee -a "$LOG_FILE"
+}
+
+# Function to check if machine is responsive
+# If pandora hangs, the web service behind it returns 502 Bad Gateway
+# Returns the HTTP code for caller to evaluate
+check_responsive() {
+    local http_code=$(curl -s -o /dev/null -w "%{http_code}" \
+        --max-time 10 \
+        --connect-timeout 5 \
+        "$TARGET_URL" 2>/dev/null)
+    
+    echo "$http_code"
+}
+
+# Function to power cycle via Home Assistant
+power_cycle() {
+    log_message "⚠️  ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..."
+    
+    # Turn off the switch
+    curl -s -X POST \
+        -H "Authorization: Bearer $HA_TOKEN" \
+        -H "Content-Type: application/json" \
+        -d '{"entity_id":"'"$HA_ENTITY"'"}' \
+        "$HA_URL/api/services/switch/turn_off" &>/dev/null
+    
+    log_message "Power OFF command sent to $HA_ENTITY"
+    sleep 10  # Wait 10 seconds for machine to shut down
+    
+    # Turn on the switch
+    curl -s -X POST \
+        -H "Authorization: Bearer $HA_TOKEN" \
+        -H "Content-Type: application/json" \
+        -d '{"entity_id":"'"$HA_ENTITY"'"}' \
+        "$HA_URL/api/services/switch/turn_on" &>/dev/null
+    
+    log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting"
+}
+
+# Main monitoring loop
+# Logs a start-up banner, then probes TARGET_URL every CHECK_INTERVAL seconds
+# without end. The failure streak is tracked in FAILURE_COUNT and
+# FAILURE_START_TIME:
+#   healthy check       -> both are reset to 0
+#   first failed check  -> the time is recorded and the grace period starts
+#   later failed checks -> once GRACE_PERIOD seconds have passed since the first
+#                          failure, power_cycle is called, the state is reset and
+#                          the loop pauses 120 seconds before checking again
+log_message "========================================"
+log_message "Starting pandora monitoring service"
+log_message "Monitoring: $TARGET_URL"
+log_message "HA Entity: $HA_ENTITY"
+log_message "Check interval: ${CHECK_INTERVAL}s"
+log_message "Grace period: ${GRACE_PERIOD}s"
+log_message "========================================"
+
+while true; do
+    http_code=$(check_responsive)
+    current_time=$(date +%s)
+
+    case "$http_code" in
+        502|000|"")
+            # Service returned 502 or timeout. An empty code means the probe
+            # produced no output at all; it is handled as a failure so that a
+            # broken probe is never logged as a healthy service.
+            if (( FAILURE_COUNT == 0 )); then
+                # First failure - start grace period
+                FAILURE_START_TIME=$current_time
+                FAILURE_COUNT=1
+                log_message "⚠️  First 502/timeout detected (HTTP $http_code) - Starting 5 minute grace period"
+            else
+                # Already in failure state
+                FAILURE_COUNT=$((FAILURE_COUNT + 1))
+                elapsed=$((current_time - FAILURE_START_TIME))
+                remaining=$((GRACE_PERIOD - elapsed))
+
+                if (( remaining > 0 )); then
+                    log_message "⚠️  Still seeing 502 (HTTP $http_code) - Grace period: ${remaining}s remaining"
+                else
+                    # Grace period expired - trigger reboot
+                    log_message "🔴 CRITICAL: Service unresponsive for >5 minutes! Triggering power cycle..."
+                    power_cycle
+                    # Start a fresh streak so the rebooted machine gets a full
+                    # grace period before another power cycle can be triggered.
+                    FAILURE_COUNT=0
+                    FAILURE_START_TIME=0
+                    sleep 120  # Wait 2 minutes after reboot attempt before checking again
+                fi
+            fi
+            ;;
+        *)
+            # Service is healthy (any code other than 502 / 000)
+            if (( FAILURE_COUNT > 0 )); then
+                log_message "✓ Service recovered! (was down $FAILURE_COUNT times)"
+            fi
+            FAILURE_COUNT=0
+            FAILURE_START_TIME=0
+            log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
+            ;;
+    esac
+
+    sleep "$CHECK_INTERVAL"
+done