#!/bin/bash
#
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant
# Integration.
#
# Polls TARGET_URL and, when it keeps answering HTTP 502 for longer than
# GRACE_PERIOD seconds, power-cycles the machine by toggling a Home Assistant
# switch entity (turn_off, wait 10 seconds, turn_on).
#
# Environment variables (all optional except HA_TOKEN):
#   TARGET_URL      URL to probe                 (default http://localhost:8080)
#   HA_URL          Home Assistant base URL      (default http://homeassistant:8123)
#   HA_TOKEN        Home Assistant bearer token  (required)
#   HA_ENTITY       switch entity to toggle      (default switch.thinkcentre_power)
#   LOG_FILE        log file path                (default /var/log/thinkcenter_monitor.log)
#   GRACE_PERIOD    seconds of continuous 502s tolerated before a power cycle
#                   (default 300)
#   CHECK_INTERVAL  delay between probes, passed to sleep (default 30)
#   HA_TIMEOUT      max seconds for each Home Assistant request (default 10)

# Fail loudly on a misspelled variable name instead of expanding it to "".
set -u

# Configuration from environment variables with sensible defaults
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
HA_TOKEN="${HA_TOKEN:-}"
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"      # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"   # 30 seconds between checks
HA_TIMEOUT="${HA_TIMEOUT:-10}"           # cap on each Home Assistant request

# State variables
ERROR_START_TIME=""      # epoch seconds of the first 502 in the current streak
IN_GRACE_PERIOD=false
LAST_RESPONSE_CODE=""

# Initialize the log file first, so the validation errors below can be
# recorded in it. Logging stays best-effort: tee still prints to stdout when
# the file is not writable.
if ! mkdir -p -- "$(dirname -- "$LOG_FILE")" || ! touch -- "$LOG_FILE"; then
  echo "WARNING: cannot write to log file $LOG_FILE; logging to stdout only" >&2
fi

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

# GRACE_PERIOD is used in shell arithmetic, so it must be a plain integer.
if [[ ! "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
  echo "ERROR: GRACE_PERIOD must be a non-negative integer number of seconds (got '$GRACE_PERIOD'). Exiting." | tee -a "$LOG_FILE"
  exit 1
fi
# Force base 10 so a value such as "0300" is not interpreted as octal.
GRACE_PERIOD=$((10#$GRACE_PERIOD))

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
#######################################
log() {
  local message="$1"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "[$timestamp] $message" | tee -a "$LOG_FILE"
}

#######################################
# Probe TARGET_URL once (5 second timeout, redirects are not followed).
# Globals:   TARGET_URL (read)
# Outputs:   the HTTP status code on stdout; "000" when curl obtained no
#            HTTP response (timeout, refused connection, curl unavailable)
#######################################
check_target() {
  local response_code
  response_code=$(curl -s -o /dev/null -w "%{http_code}" -m 5 "$TARGET_URL" 2>/dev/null)
  echo "${response_code:-000}"
}

#######################################
# Call a Home Assistant switch service for one entity.
# Globals:   HA_URL, HA_TOKEN, HA_TIMEOUT (read)
# Arguments: $1 - service name (turn_off or turn_on)
#            $2 - entity id
# Returns:   0 only if Home Assistant answered with a success status;
#            non-zero on HTTP errors (--fail), timeouts or connection errors
#######################################
call_switch_service() {
  local service="$1"
  local entity="$2"
  # --fail makes curl exit non-zero on HTTP >= 400 (e.g. 401 for a bad token);
  # without it a rejected request would be reported as success.
  # -m bounds the request so an unresponsive Home Assistant cannot stall the
  # monitoring loop.
  # NOTE(review): the token is passed on curl's command line and is therefore
  # visible in the process list while the request runs.
  curl -s --fail -m "$HA_TIMEOUT" -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/switch/$service" > /dev/null 2>&1
}

#######################################
# Power-cycle a machine: turn the switch off, wait 10 seconds, turn it on.
# The turn_on request is attempted even if turn_off failed. The grace-period
# state is cleared afterwards regardless of the outcome.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (written)
# Arguments: $1 - Home Assistant switch entity id
#######################################
trigger_power_cycle() {
  local entity="$1"

  log "ALERT: Triggering power cycle for entity: $entity"

  # Turn off
  log "Sending turn_off request to Home Assistant..."
  if call_switch_service turn_off "$entity"; then
    log "Turn off request sent successfully"
  else
    log "ERROR: Failed to send turn_off request"
  fi

  # Wait 10 seconds
  log "Waiting 10 seconds before power-on..."
  sleep 10

  # Turn on
  log "Sending turn_on request to Home Assistant..."
  if call_switch_service turn_on "$entity"; then
    log "Turn on request sent successfully"
    log "Power cycle completed for $entity"
  else
    log "ERROR: Failed to send turn_on request"
  fi

  # Reset state
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false
}

log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
log "  TARGET_URL: $TARGET_URL"
log "  HA_URL: $HA_URL"
log "  HA_ENTITY: $HA_ENTITY"
log "  GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log "  CHECK_INTERVAL: ${CHECK_INTERVAL}s"

# Main monitoring loop
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))

      if (( ELAPSED >= GRACE_PERIOD )); then
        # Grace period expired - trigger power cycle.
        # trigger_power_cycle also resets the grace-period state.
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
      else
        # Still within grace period
        REMAINING=$((GRACE_PERIOD - ELAPSED))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  elif [[ "$RESPONSE_CODE" == "000" ]]; then
    # curl got no HTTP response at all (timeout / connection failure). That
    # is not evidence of recovery, so an ongoing grace period is left
    # untouched; a power cycle is still only triggered by an observed 502.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response (timeout or connection failure) - not counted as recovery, grace period continues"
    fi
  else
    # Service is responding with a real HTTP status other than 502
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      # Service recovered during grace period
      CURRENT_TIME=$(date +%s)
      RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
      log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
      ERROR_START_TIME=""
      IN_GRACE_PERIOD=false
    elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
      log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
    fi
  fi

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check
  sleep "$CHECK_INTERVAL"
done