#!/bin/bash
#
# Thinkcentre Monitor - Kubernetes machine health monitor with Home Assistant integration.
# Detects hung machines (HTTP 502 responses) and power-cycles them via Home Assistant.

# Configuration, read from environment variables with defaults:
#   TARGET_URL      URL probed on every check
#   HA_URL          base URL of the Home Assistant instance
#   HA_TOKEN        Home Assistant bearer token (no default; the script exits if it is empty)
#   HA_ENTITY       entity_id sent with the turn_off / turn_on requests
#   LOG_FILE        file every log line is appended to
#   GRACE_PERIOD    seconds a 502 may persist before a power cycle is triggered
#   CHECK_INTERVAL  seconds to sleep between checks
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
# ':-' keeps this expansion valid even if the script is ever run with 'set -u'.
HA_TOKEN="${HA_TOKEN:-}"
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"     # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"  # 30 seconds between checks

# State variables, updated by the main loop and reset after a power cycle.
ERROR_START_TIME=""    # epoch seconds of the first 502 of the current outage; empty when none
IN_GRACE_PERIOD=false  # true while a 502 outage is being timed
LAST_RESPONSE_CODE=""  # response code recorded on the previous check

# Initialize the log file first so the validation errors below can be appended
# to it. A failure here is reported but not fatal: every message is also
# written to stdout.
if ! mkdir -p "$(dirname "$LOG_FILE")" || ! touch "$LOG_FILE"; then
  echo "WARNING: cannot create log file $LOG_FILE" >&2
fi

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

# GRACE_PERIOD is used in integer arithmetic, so anything other than a plain
# decimal number (no leading zeros, which bash would read as octal) is rejected.
if [[ ! "$GRACE_PERIOD" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "ERROR: GRACE_PERIOD must be a whole number of seconds (got '$GRACE_PERIOD'). Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and appended to LOG_FILE
#######################################
log() {
  local message="$1"
  # Declared separately so 'local' does not mask the exit status of date.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}

#######################################
# Probe TARGET_URL once and print the HTTP status code.
# Globals:   TARGET_URL (read)
# Arguments: none
# Outputs:   the status code on stdout; "000" when no HTTP response was
#            obtained (curl prints 000 for %{http_code} in that case, and the
#            same value is substituted if curl produced no output at all)
#######################################
check_target() {
  local response_code
  # -s: silent, -o /dev/null: discard the body, -m 5: give up after 5 seconds.
  # Redirects are not followed, so a 3xx is reported as-is.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" -m 5 "$TARGET_URL" 2>/dev/null)
  printf '%s\n' "${response_code:-000}"
}

#######################################
# Call a Home Assistant switch service for one entity.
# Globals:   HA_URL, HA_TOKEN (read)
# Arguments: $1 - service name (turn_off or turn_on)
#            $2 - entity_id to put in the JSON request body
# Returns:   curl's exit status. --fail makes HTTP responses >= 400 count as
#            failures, and --max-time keeps an unresponsive Home Assistant
#            from blocking the monitor indefinitely.
#######################################
_ha_switch_service() {
  local service="$1"
  local entity="$2"

  curl -s --fail --max-time 30 -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/switch/$service" > /dev/null 2>&1
}

#######################################
# Power-cycle a machine: turn the switch off, wait 10 seconds, turn it on.
# The turn_on request is sent even if turn_off failed. Each step is logged.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (reset on exit)
# Arguments: $1 - Home Assistant entity_id of the power switch
#######################################
trigger_power_cycle() {
  local entity="$1"

  log "ALERT: Triggering power cycle for entity: $entity"

  # Turn off
  log "Sending turn_off request to Home Assistant..."
  if _ha_switch_service turn_off "$entity"; then
    log "Turn off request sent successfully"
  else
    log "ERROR: Failed to send turn_off request"
  fi

  # Wait 10 seconds
  log "Waiting 10 seconds before power-on..."
  sleep 10

  # Turn on
  log "Sending turn_on request to Home Assistant..."
  if _ha_switch_service turn_on "$entity"; then
    log "Turn on request sent successfully"
    log "Power cycle completed for $entity"
  else
    log "ERROR: Failed to send turn_on request"
  fi

  # Reset state
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false
}

# Startup banner: record the effective configuration. HA_TOKEN is not logged.
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
log " TARGET_URL: $TARGET_URL"
log " HA_URL: $HA_URL"
log " HA_ENTITY: $HA_ENTITY"
log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

# Main monitoring loop.
#
# Every CHECK_INTERVAL seconds the target is probed and the result handled:
#   502            - the first one starts the grace period; once a 502 is seen
#                    GRACE_PERIOD seconds or more after that, the machine is
#                    power-cycled.
#   000 / empty    - no HTTP response at all. This is neither a 502 nor proof
#                    of recovery, so a running grace period is left untouched
#                    and no power cycle is triggered by it.
#   anything else  - the service answered; a running grace period is cancelled.
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))

      if (( ELAPSED >= GRACE_PERIOD )); then
        # Grace period expired - trigger power cycle
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
        ERROR_START_TIME=""
        IN_GRACE_PERIOD=false
      else
        # Still within grace period
        REMAINING=$((GRACE_PERIOD - ELAPSED))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  elif [[ -z "$RESPONSE_CODE" || "$RESPONSE_CODE" == "000" ]]; then
    # Timeout / connection failure: the service did not answer, so this must
    # not be mistaken for a recovery and must not reset the outage timer.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response during grace period - not treated as recovery, grace period continues"
    fi
  elif [[ "$IN_GRACE_PERIOD" == true ]]; then
    # Service answered with a non-502 code during the grace period
    CURRENT_TIME=$(date +%s)
    RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
    log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
    ERROR_START_TIME=""
    IN_GRACE_PERIOD=false
  elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
    # Service is responding; note when the status code changes
    log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
  fi

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check
  sleep "$CHECK_INTERVAL"
done