#!/bin/bash
#
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant
# Integration.
#
# Polls TARGET_URL at a fixed interval. When the target keeps answering
# HTTP 502 for at least GRACE_PERIOD seconds, the machine is assumed to be hung
# and is power-cycled by switching HA_ENTITY off and back on through the
# Home Assistant REST API.
#
# Environment (all optional unless noted):
#   TARGET_URL      URL to probe               (default http://localhost:8080)
#   HA_URL          Home Assistant base URL    (default http://homeassistant:8123)
#   HA_TOKEN        Home Assistant bearer token (REQUIRED)
#   HA_ENTITY       entity that powers the box (default switch.thinkcentre_power)
#   LOG_FILE        log destination            (default /var/log/thinkcenter_monitor.log)
#   GRACE_PERIOD    seconds of continuous failure before a power cycle (default 300)
#   CHECK_INTERVAL  seconds between probes     (default 30)
#   HA_TIMEOUT      max seconds for one Home Assistant request (default 10)

# Only -u is enabled: curl legitimately exits non-zero on timeouts, so -e
# would kill the monitor on the very failures it is supposed to observe.
set -u

# Configuration from environment variables with sensible defaults
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
HA_TOKEN="${HA_TOKEN:-}"
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"     # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"  # 30 seconds between checks
HA_TIMEOUT="${HA_TIMEOUT:-10}"          # keeps a stalled HA from hanging the monitor
readonly TARGET_URL HA_URL HA_TOKEN HA_ENTITY LOG_FILE CHECK_INTERVAL HA_TIMEOUT

# State variables (shared between the main loop and trigger_power_cycle)
ERROR_START_TIME=""    # epoch seconds of the first 502 of the current incident
IN_GRACE_PERIOD=false  # true while an incident is being timed
LAST_RESPONSE_CODE=""  # code seen on the previous probe

#######################################
# Create the log file (and its directory) if possible.
# Globals:   LOG_FILE (read)
# Outputs:   a warning on stderr when the log file cannot be created
#######################################
init_log() {
  local log_dir
  log_dir=$(dirname -- "$LOG_FILE")
  if ! mkdir -p -- "$log_dir" || ! touch -- "$LOG_FILE"; then
    printf 'WARNING: cannot create log file %s\n' "$LOG_FILE" >&2
  fi
}

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
#######################################
log() {
  local message="$1"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}

#######################################
# Test Home Assistant connectivity at startup.
# Globals:   HA_URL, HA_TOKEN, HA_TIMEOUT (read)
# Returns:   0 when GET $HA_URL/api/ answers HTTP 200, 1 otherwise
#######################################
test_ha_connection() {
  local http_code
  log "Testing Home Assistant connection..."
  http_code=$(curl -s -o /dev/null -w "%{http_code}" -m "$HA_TIMEOUT" \
    -H "Authorization: Bearer $HA_TOKEN" \
    "$HA_URL/api/" 2>/dev/null)

  if [[ "$http_code" == "200" ]]; then
    log "Home Assistant API connection successful"
    return 0
  fi
  log "ERROR: Home Assistant API returned HTTP $http_code"
  return 1
}

#######################################
# Probe the target once.
# Globals:   TARGET_URL (read)
# Outputs:   the HTTP status code on stdout; curl prints 000 when no HTTP
#            response was received (timeout, refused connection, ...).
# NOTE: must not call log() - stdout is captured by the caller.
#######################################
check_target() {
  local response_code
  # No -L: redirects are reported as-is, not followed.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" -m 5 "$TARGET_URL" 2>/dev/null)
  echo "$response_code"
}

#######################################
# Extract the domain from an entity_id ("switch" from "switch.device_name").
# Arguments: $1 - entity_id
# Outputs:   the domain on stdout
#######################################
get_entity_domain() {
  local entity="$1"
  echo "${entity%%.*}"
}

#######################################
# Map an entity domain to the "<domain>/turn_off" service used to cut power.
# The matching turn_on service is derived by the caller from the same prefix.
# Arguments: $1 - entity domain
# Outputs:   service path (relative to /api/services/) on stdout
#######################################
get_toggle_service() {
  local domain="$1"
  case "$domain" in
    switch) echo "switch/turn_off" ;;
    light)  echo "light/turn_off" ;;
    # Generic service that accepts entities of any domain; a switch/* call
    # would not match an entity that lives in a different domain.
    *)      echo "homeassistant/turn_off" ;;
  esac
}

#######################################
# POST one service call to Home Assistant and log the outcome.
# Globals:   HA_URL, HA_TOKEN, HA_TIMEOUT (read)
# Arguments: $1 - action name used in log lines (turn_off | turn_on)
#            $2 - service path, e.g. switch/turn_off
#            $3 - entity_id to act on
# Returns:   0 on a 2xx answer, 1 otherwise
#######################################
call_ha_service() {
  local action="$1"
  local service="$2"
  local entity="$3"
  local label="Turn ${action#turn_}"
  local response http_code body

  log "Sending $action request to Home Assistant ($service)..."
  log "DEBUG: POST to $HA_URL/api/services/$service with entity_id=$entity"

  # -w appends "\n<status>" so body and status arrive in one capture.
  response=$(curl -s -m "$HA_TIMEOUT" -w '\n%{http_code}' -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/$service" 2>&1)

  # Last line is the status code, everything before it is the body.
  http_code="${response##*$'\n'}"
  if [[ "$response" == *$'\n'* ]]; then
    body="${response%$'\n'*}"
  else
    body=""
  fi

  if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then
    log "$label request sent successfully (HTTP $http_code)"
    return 0
  fi
  log "ERROR: Failed to send $action request (HTTP $http_code)"
  log "ERROR: Response body: $body"
  return 1
}

#######################################
# Power-cycle a machine: turn the entity off, wait 10 seconds, turn it on.
# The turn_on request is sent even if turn_off failed, so the machine is
# never deliberately left powered down.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (reset)
# Arguments: $1 - entity_id controlling the machine's power
# Returns:   0 when the turn_on request succeeded, 1 otherwise
#######################################
trigger_power_cycle() {
  local entity="$1"
  local domain turn_off_service turn_on_service
  local status=0

  domain=$(get_entity_domain "$entity")
  turn_off_service=$(get_toggle_service "$domain")
  turn_on_service="${turn_off_service%/*}/turn_on"

  log "ALERT: Triggering power cycle for entity: $entity (domain: $domain)"

  call_ha_service "turn_off" "$turn_off_service" "$entity"

  log "Waiting 10 seconds before power-on..."
  sleep 10

  if call_ha_service "turn_on" "$turn_on_service" "$entity"; then
    log "Power cycle completed for $entity"
  else
    status=1
  fi

  # Reset state so the next 502 starts a fresh grace period.
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false
  return "$status"
}

#######################################
# Advance the incident state machine for one probe result.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (read/written)
#            GRACE_PERIOD, HA_ENTITY, LAST_RESPONSE_CODE (read)
# Arguments: $1 - HTTP status code returned by check_target
#######################################
handle_response() {
  local code="$1"
  local now elapsed

  if [[ "$code" == "502" ]]; then
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 of this incident - start timing.
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
      return
    fi

    now=$(date +%s)
    elapsed=$(( now - ERROR_START_TIME ))
    if (( elapsed >= GRACE_PERIOD )); then
      log "Grace period expired after ${elapsed}s. Service still unavailable."
      trigger_power_cycle "$HA_ENTITY"
    else
      log "Still in grace period. Service recovery window: $(( GRACE_PERIOD - elapsed ))s remaining"
    fi
    return
  fi

  if [[ "$IN_GRACE_PERIOD" == true ]]; then
    if [[ "$code" == "000" ]]; then
      # No HTTP response at all is not evidence of recovery; keep the
      # incident open so a hung machine cannot escape the timer by
      # alternating between 502 and timeouts.
      log "No response from target during grace period - keeping error state"
      return
    fi
    now=$(date +%s)
    log "Service recovered during grace period after $(( now - ERROR_START_TIME ))s. Resetting error state."
    ERROR_START_TIME=""
    IN_GRACE_PERIOD=false
  elif [[ "$code" != "000" && -n "$LAST_RESPONSE_CODE" \
    && "$LAST_RESPONSE_CODE" != "$code" ]]; then
    log "Service status changed from $LAST_RESPONSE_CODE to $code"
  fi
}

#######################################
# Validate configuration, verify Home Assistant is reachable, then monitor
# the target forever.
#######################################
main() {
  local response_code

  # The log file must exist before anything is tee'd into it.
  init_log

  # Validate required configuration
  if [[ -z "$HA_TOKEN" ]]; then
    echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
    exit 1
  fi
  if [[ ! "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
    log "ERROR: GRACE_PERIOD must be a non-negative integer (got '$GRACE_PERIOD'). Exiting."
    exit 1
  fi
  # Force base 10 so a value such as 0300 is not read as octal.
  GRACE_PERIOD=$(( 10#$GRACE_PERIOD ))
  readonly GRACE_PERIOD
  if ! command -v curl >/dev/null 2>&1; then
    log "ERROR: curl is required but was not found in PATH. Exiting."
    exit 1
  fi

  log "=== Thinkcentre Monitor Started ==="
  log "Configuration:"
  log " TARGET_URL: $TARGET_URL"
  log " HA_URL: $HA_URL"
  log " HA_ENTITY: $HA_ENTITY"
  log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
  log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

  # Test HA connection before entering main loop
  if ! test_ha_connection; then
    log "FATAL: Cannot connect to Home Assistant. Exiting."
    exit 1
  fi

  # Main monitoring loop
  while true; do
    response_code=$(check_target)
    log "HTTP Response: $response_code"

    handle_response "$response_code"
    LAST_RESPONSE_CODE="$response_code"

    # Wait for next check
    sleep "$CHECK_INTERVAL"
  done
}

main "$@"