#!/bin/bash
#
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
#
# Environment variables:
#   TARGET_URL      URL that is polled for its HTTP status code
#   HA_URL          Base URL of the Home Assistant instance
#   HA_TOKEN        Home Assistant bearer token (required; no default)
#   HA_ENTITY       Entity that controls the machine's power
#   LOG_FILE        File that every log line is appended to
#   GRACE_PERIOD    Seconds a 502 may persist before a power cycle is triggered
#   CHECK_INTERVAL  Seconds to sleep between two checks

# Configuration from environment variables with sensible defaults
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
HA_TOKEN="${HA_TOKEN:-}"
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"     # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"  # 30 seconds between checks

# State variables
ERROR_START_TIME=""      # epoch seconds of the first 502 of the current outage; empty when healthy
IN_GRACE_PERIOD=false    # true while a 502 outage is being timed
LAST_RESPONSE_CODE=""    # HTTP code seen on the previous check

# Initialize log file first, so that the validation errors below can be
# appended to it (tee -a fails if the directory does not exist yet).
if ! mkdir -p "$(dirname "$LOG_FILE")" || ! touch "$LOG_FILE"; then
  printf 'WARNING: cannot create log file %s; appending to it may fail\n' "$LOG_FILE" >&2
fi

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  printf '%s\n' "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

# GRACE_PERIOD is used in shell arithmetic, so it has to be a plain integer.
if ! [[ "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
  printf '%s\n' "ERROR: GRACE_PERIOD must be a whole number of seconds (got '$GRACE_PERIOD'). Exiting." \
    | tee -a "$LOG_FILE"
  exit 1
fi

#######################################
# Print a timestamped message and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and appended to LOG_FILE
#######################################
log() {
  local message="$1"
  local timestamp
  # Declared separately so a failing `date` is not masked by `local`.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}

#######################################
# Test Home Assistant connectivity at startup.
# Globals:   HA_URL, HA_TOKEN (read)
# Outputs:   progress and result via log
# Returns:   0 if GET $HA_URL/api/ answers HTTP 200, 1 otherwise
#######################################
test_ha_connection() {
  local http_code

  log "Testing Home Assistant connection..."
  # -m bounds the whole request so an unresponsive Home Assistant cannot
  # hang the monitor at startup. When curl gets no HTTP response it
  # reports the code as 000, which falls through to the error branch.
  http_code=$(curl -s -o /dev/null -w '%{http_code}' -m 10 \
    -H "Authorization: Bearer $HA_TOKEN" "$HA_URL/api/" 2>/dev/null)

  if [[ "$http_code" == "200" ]]; then
    log "Home Assistant API connection successful"
    return 0
  fi

  log "ERROR: Home Assistant API returned HTTP $http_code"
  return 1
}

#######################################
# Fetch TARGET_URL and report its HTTP status code.
# Redirects are not followed (curl is run without -L).
# Globals:   TARGET_URL (read)
# Outputs:   the status code on stdout; "000" when no HTTP response was
#            obtained within the 5-second limit or curl produced no output
#######################################
check_target() {
  local response_code
  response_code=$(curl -s -o /dev/null -w '%{http_code}' -m 5 "$TARGET_URL" 2>/dev/null)
  # Normalise an empty result (curl could not run at all) to curl's own
  # "no response" code so callers always receive a status code.
  printf '%s\n' "${response_code:-000}"
}

#######################################
# Extract the domain from an entity_id (e.g. "switch" from "switch.device_name").
# Arguments: $1 - entity_id
# Outputs:   everything before the first "." on stdout; the whole value
#            when it contains no "."
#######################################
get_entity_domain() {
  local entity="$1"
  # printf rather than echo: echo would swallow values such as "-n".
  printf '%s\n' "${entity%%.*}"
}

#######################################
# Determine the turn-off service path for an entity domain.
# The result has the form "<domain>/turn_off".
# Arguments: $1 - entity domain (e.g. "switch")
# Outputs:   service path on stdout
#######################################
get_toggle_service() {
  local domain="$1"
  case "$domain" in
    switch)
      printf '%s\n' "switch/turn_off"
      ;;
    light)
      printf '%s\n' "light/turn_off"
      ;;
    *)
      # A domain-specific service such as switch/turn_off does not act on
      # entities of another domain, so unknown domains go through Home
      # Assistant's generic homeassistant/turn_off service instead.
      printf '%s\n' "homeassistant/turn_off"
      ;;
  esac
}

#######################################
# POST one service call for one entity to the Home Assistant REST API.
# Globals:   HA_URL, HA_TOKEN (read)
# Arguments: $1 - label used in the success message (e.g. "Turn off")
#            $2 - service path (e.g. "switch/turn_off")
#            $3 - entity_id
# Outputs:   request details and result via log
# Returns:   0 if the HTTP status is 2xx, 1 otherwise
#######################################
_ha_call_service() {
  local label="$1" service="$2" entity="$3"
  local -r timeout_s=30
  local response http_code body=""

  log "DEBUG: POST to $HA_URL/api/services/$service with entity_id=$entity"
  # -w appends the status code on its own final line; -m bounds the request
  # so an unresponsive Home Assistant cannot stall the monitor.
  response=$(curl -s -m "$timeout_s" -w '\n%{http_code}' -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/$service" 2>&1)

  # Split with parameter expansion instead of `head -n -1`, which is GNU-only.
  http_code="${response##*$'\n'}"
  if [[ "$response" == *$'\n'* ]]; then
    body="${response%$'\n'*}"
  fi

  if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then
    log "$label request sent successfully (HTTP $http_code)"
    return 0
  fi

  log "ERROR: Failed to send ${service##*/} request (HTTP $http_code)"
  log "ERROR: Response body: $body"
  return 1
}

#######################################
# Power-cycle an entity: turn it off, wait 10 seconds, turn it back on.
# The turn-on request is sent even when turn-off failed, and is retried,
# because a failed turn-on after a successful turn-off leaves the machine off.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (reset on every call)
# Arguments: $1 - entity_id to power-cycle
# Outputs:   progress via log
# Returns:   0 if both requests succeeded, 1 otherwise
#######################################
trigger_power_cycle() {
  local entity="$1"
  local domain turn_off_service turn_on_service
  local -r max_on_attempts=3 retry_delay_s=5
  local attempt off_ok=false on_ok=false

  domain=$(get_entity_domain "$entity")
  turn_off_service=$(get_toggle_service "$domain")
  turn_on_service="${turn_off_service%/*}/turn_on"

  log "ALERT: Triggering power cycle for entity: $entity (domain: $domain)"

  # Turn off
  log "Sending turn_off request to Home Assistant ($turn_off_service)..."
  if _ha_call_service "Turn off" "$turn_off_service" "$entity"; then
    off_ok=true
  fi

  # Wait 10 seconds
  log "Waiting 10 seconds before power-on..."
  sleep 10

  # Turn on (with retries)
  for (( attempt = 1; attempt <= max_on_attempts; attempt++ )); do
    log "Sending turn_on request to Home Assistant ($turn_on_service)..."
    if _ha_call_service "Turn on" "$turn_on_service" "$entity"; then
      on_ok=true
      break
    fi
    if (( attempt < max_on_attempts )); then
      log "Retrying turn_on in ${retry_delay_s}s (attempt $(( attempt + 1 ))/${max_on_attempts})..."
      sleep "$retry_delay_s"
    fi
  done

  if [[ "$on_ok" != true ]]; then
    log "ERROR: turn_on failed after $max_on_attempts attempts; $entity may still be powered off"
  elif [[ "$off_ok" != true ]]; then
    log "WARNING: turn_off failed; $entity was probably not power-cycled"
  else
    log "Power cycle completed for $entity"
  fi

  # Reset state
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false

  [[ "$off_ok" == true && "$on_ok" == true ]]
}

# Startup banner: record the effective configuration in the log.
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
for setting_name in TARGET_URL HA_URL HA_ENTITY; do
  log " ${setting_name}: ${!setting_name}"
done
grace_minutes=$(( GRACE_PERIOD / 60 ))
log " GRACE_PERIOD: ${GRACE_PERIOD}s (${grace_minutes} minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

# Refuse to start monitoring when Home Assistant cannot be reached.
test_ha_connection || {
  log "FATAL: Cannot connect to Home Assistant. Exiting."
  exit 1
}

# Main monitoring loop
#
# State machine driven by the HTTP code of each check:
#   502         - start the grace period, or power-cycle once it has expired
#   000 / empty - no HTTP response at all; inconclusive, so state is left alone
#   other       - service is answering; ends a running grace period
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting $(( GRACE_PERIOD / 60 ))-minute grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$(( CURRENT_TIME - ERROR_START_TIME ))

      if (( ELAPSED >= GRACE_PERIOD )); then
        # Grace period expired - trigger power cycle
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
        # The next 502 starts a fresh grace period, which gives the
        # machine time to boot before another power cycle is considered.
        ERROR_START_TIME=""
        IN_GRACE_PERIOD=false
      else
        # Still within grace period
        REMAINING=$(( GRACE_PERIOD - ELAPSED ))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  elif [[ -z "$RESPONSE_CODE" || "$RESPONSE_CODE" == "000" ]]; then
    # No HTTP response was received (timeout or connection failure).
    # This must not count as a recovery: doing so would reset the grace
    # timer of a machine that went from 502 to not answering at all.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response from target (timeout or connection failure); not counted as recovery, grace period continues"
    fi
  elif [[ "$IN_GRACE_PERIOD" == true ]]; then
    # Service recovered during grace period
    CURRENT_TIME=$(date +%s)
    RECOVERY_TIME=$(( CURRENT_TIME - ERROR_START_TIME ))
    log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
    ERROR_START_TIME=""
    IN_GRACE_PERIOD=false
  elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
    # Service is healthy; only report transitions between status codes
    log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
  fi

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check. If sleep rejects CHECK_INTERVAL, fall back to the
  # default 30 seconds so that the loop cannot spin without pausing.
  sleep "$CHECK_INTERVAL" || sleep 30
done