added monitoring script for Pandora
This commit is contained in:
108
pandora_monitor.sh
Normal file
108
pandora_monitor.sh
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/bin/bash
|
||||
# Monitoring script for the pandora machine.
# Checks whether vorgabenportal.knowyoursecurity.com returns 502 Bad Gateway (which indicates that pandora has hung).
# If the 502 (or a timeout) persists for more than 5 minutes, the machine is power-cycled via Home Assistant.
|
||||
|
||||
# Configuration
# Every tunable below can be overridden from the environment (for example
# from a systemd unit or an env file), so the access token does not have to
# be written into this script. The defaults are unchanged.
TARGET_URL="${TARGET_URL:-https://vorgabenportal.knowyoursecurity.com}"
HA_URL="${HA_URL:-http://homeassistant.local:8123}"  # Home Assistant URL
HA_TOKEN="${HA_TOKEN:-YOUR_LONG_LIVED_ACCESS_TOKEN}"  # Get from HA: Profile > Long-Lived Access Tokens
HA_ENTITY="${HA_ENTITY:-switch.pandora_power}"  # Entity ID of the smart switch/outlet
LOG_FILE="${LOG_FILE:-/var/log/pandora_monitor.log}"
# NOTE(review): MAX_FAILURES is not referenced by the monitoring loop visible
# in this file, which decides purely on GRACE_PERIOD - confirm whether it is
# still needed.
MAX_FAILURES="${MAX_FAILURES:-2}"  # Reboot after 2 consecutive failures
GRACE_PERIOD="${GRACE_PERIOD:-300}"  # 5 minutes grace for deployments
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"  # Check every 1 minute (more granular during grace period)

# Runtime state (not configuration): consecutive failed checks and the epoch
# second at which the current failure streak started (0 = no streak).
FAILURE_COUNT=0
FAILURE_START_TIME=0
|
||||
|
||||
#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $@ - message text; multiple arguments are joined with spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
#######################################
log_message() {
  # printf keeps the message literal regardless of its content, and "$*"
  # keeps every argument instead of silently dropping all but the first.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
|
||||
|
||||
#######################################
# Probe the target URL and report the HTTP status code.
# If pandora hangs, the web service in front of it answers 502 Bad Gateway.
# Globals:   TARGET_URL (read)
# Arguments: none
# Outputs:   a three-digit status code on stdout; "000" when no HTTP
#            response was obtained (timeout, connection error, curl could
#            not run, or curl printed anything that is not a status code)
# Returns:   0 always - the caller evaluates the printed code
#######################################
check_responsive() {
  local http_code
  # curl exits non-zero on timeouts/connection errors but still prints "000"
  # through -w, so its exit status is deliberately ignored here.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' \
    --max-time 10 \
    --connect-timeout 5 \
    "$TARGET_URL" 2>/dev/null) || true

  # An empty or malformed value (e.g. curl is not installed) must not be
  # mistaken for a healthy response by the caller.
  if [[ ! "$http_code" =~ ^[0-9]{3}$ ]]; then
    http_code="000"
  fi

  printf '%s\n' "$http_code"
}
|
||||
|
||||
#######################################
# Call a Home Assistant switch service for the configured entity.
# Globals:   HA_URL, HA_TOKEN, HA_ENTITY (read)
# Arguments: $1 - service name, "turn_off" or "turn_on"
# Returns:   0 if the request succeeded; non-zero on transport errors,
#            timeouts, or an HTTP error status (curl --fail)
#######################################
_ha_switch() {
  local action=$1
  local payload
  payload=$(printf '{"entity_id":"%s"}' "$HA_ENTITY")

  # --fail turns HTTP error statuses (e.g. 401 for a bad token) into a
  # non-zero exit; --max-time keeps an unreachable Home Assistant from
  # blocking the monitor indefinitely.
  curl -s --fail --max-time 15 -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    "$HA_URL/api/services/switch/$action" &>/dev/null
}

#######################################
# Power-cycle the machine via Home Assistant: switch off, wait, switch on.
# Globals:   FAILURE_COUNT, HA_ENTITY (read); HA_URL, HA_TOKEN via _ha_switch
# Arguments: none
# Outputs:   progress and errors through log_message
# Returns:   0 if both the OFF and the ON request succeeded, 1 otherwise
#######################################
power_cycle() {
  local status=0
  local attempt

  log_message "⚠️ ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..."

  # Turn off the switch
  if _ha_switch turn_off; then
    log_message "Power OFF command sent to $HA_ENTITY"
  else
    # Keep going: the request may have been executed even though the reply
    # was lost, so the ON request below must still be sent.
    log_message "ERROR: Power OFF request to Home Assistant failed for $HA_ENTITY"
    status=1
  fi

  sleep 10 # Wait 10 seconds for machine to shut down

  # Turn on the switch. A failure here would leave the machine powered off,
  # so retry a few times before giving up.
  for attempt in 1 2 3; do
    if _ha_switch turn_on; then
      log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting"
      return "$status"
    fi
    log_message "ERROR: Power ON request to Home Assistant failed for $HA_ENTITY (attempt $attempt/3)"
    sleep 5
  done

  return 1
}
|
||||
|
||||
# Main monitoring loop
# Polls the target every CHECK_INTERVAL seconds. The first failed check
# starts a grace period; if checks are still failing once GRACE_PERIOD
# seconds have elapsed, the machine is power-cycled and the failure state
# is reset. Any healthy check resets the failure state as well.
log_message "========================================"
log_message "Starting pandora monitoring service"
log_message "Monitoring: $TARGET_URL"
log_message "HA Entity: $HA_ENTITY"
log_message "Check interval: ${CHECK_INTERVAL}s"
log_message "Grace period: ${GRACE_PERIOD}s"
log_message "========================================"

while true; do
  http_code=$(check_responsive)
  current_time=$(date +%s)

  # Healthy means: a well-formed status code that is neither 502 nor 000
  # (timeout / no response). An empty or malformed value counts as a
  # failure instead of being silently treated as healthy.
  if [[ "$http_code" =~ ^[0-9]{3}$ && "$http_code" != "502" && "$http_code" != "000" ]]; then
    if (( FAILURE_COUNT > 0 )); then
      log_message "✓ Service recovered! (was down $FAILURE_COUNT times)"
    fi
    FAILURE_COUNT=0
    FAILURE_START_TIME=0
    log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
  elif (( FAILURE_COUNT == 0 )); then
    # First failure - start grace period
    FAILURE_START_TIME=$current_time
    FAILURE_COUNT=1
    log_message "⚠️ First 502/timeout detected (HTTP ${http_code:-none}) - Starting ${GRACE_PERIOD}s grace period"
  else
    # Already in failure state
    FAILURE_COUNT=$((FAILURE_COUNT + 1))
    elapsed=$((current_time - FAILURE_START_TIME))
    remaining=$((GRACE_PERIOD - elapsed))

    if (( remaining > 0 )); then
      log_message "⚠️ Still seeing 502 (HTTP ${http_code:-none}) - Grace period: ${remaining}s remaining"
    else
      # Grace period expired - trigger reboot
      log_message "🔴 CRITICAL: Service unresponsive for >${GRACE_PERIOD}s! Triggering power cycle..."
      power_cycle
      FAILURE_COUNT=0
      FAILURE_START_TIME=0
      sleep 120 # Wait 2 minutes after reboot attempt before checking again
    fi
  fi

  sleep "$CHECK_INTERVAL"
done
|
||||
Reference in New Issue
Block a user