Files
Thinkcentre-watchdog/thinkcenter_monitor.sh

196 lines
6.7 KiB
Bash

#!/bin/bash
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
# Configuration from environment variables with sensible defaults
# Each setting keeps the value supplied by the environment; when it is unset
# or empty the default given here is assigned instead.
: "${TARGET_URL:=http://localhost:8080}"               # URL probed on every check
: "${HA_URL:=http://homeassistant:8123}"               # Home Assistant base URL
: "${HA_TOKEN:=}"                                      # API token; no default, validated below
: "${HA_ENTITY:=switch.thinkcentre_power}"             # entity toggled to power-cycle the machine
: "${LOG_FILE:=/var/log/thinkcenter_monitor.log}"      # file that log() appends to
: "${GRACE_PERIOD:=300}"                               # seconds (5 minutes) of 502s tolerated
: "${CHECK_INTERVAL:=30}"                              # seconds slept between two checks
# Mutable state shared between the main loop and trigger_power_cycle
ERROR_START_TIME=      # epoch seconds of the first 502 of the current outage; empty when none
IN_GRACE_PERIOD=false  # "true" while an outage is being timed
LAST_RESPONSE_CODE=    # response code seen on the previous loop iteration
# Initialize the log file first: the configuration error below is appended to
# $LOG_FILE with tee, which needs the log directory to exist already.
mkdir -p "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"
# Validate required configuration: every Home Assistant request sends
# HA_TOKEN as a bearer token, so the monitor refuses to start without it.
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi
# log MESSAGE
# Writes "[YYYY-MM-DD HH:MM:SS] MESSAGE" to stdout and appends the same line
# to $LOG_FILE.
log() {
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$now" "$1" | tee -a "$LOG_FILE"
}
# test_ha_connection
# Startup probe: sends an authenticated GET to "$HA_URL/api/" and checks the
# HTTP status code.
# Globals:  HA_URL, HA_TOKEN (read); HA_TIMEOUT (read, optional: request
#           timeout in seconds, default 10)
# Returns:  0 when the status code is 200, 1 otherwise
test_ha_connection() {
  local http_code
  log "Testing Home Assistant connection..."
  # --max-time bounds the whole request so an unresponsive Home Assistant
  # cannot block startup forever; on failure curl reports the code as 000.
  http_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --max-time "${HA_TIMEOUT:-10}" \
    -H "Authorization: Bearer $HA_TOKEN" "$HA_URL/api/" 2>/dev/null)
  if [[ "$http_code" == "200" ]]; then
    log "Home Assistant API connection successful"
    return 0
  else
    log "ERROR: Home Assistant API returned HTTP $http_code"
    return 1
  fi
}
# check_target
# Requests $TARGET_URL once (5 second limit, redirects are not followed, the
# body is discarded) and prints the HTTP status code reported by curl.
check_target() {
  local status
  status=$(curl --silent --output /dev/null --write-out "%{http_code}" \
    --max-time 5 "$TARGET_URL" 2>/dev/null)
  echo "$status"
}
# get_entity_domain ENTITY_ID
# Prints the part of ENTITY_ID before its first dot, e.g. "switch" for
# "switch.device_name". An id without a dot is printed unchanged.
get_entity_domain() {
  local entity_id="$1"
  local domain="${entity_id%%.*}"
  echo "$domain"
}
# get_toggle_service DOMAIN
# Prints the Home Assistant "<domain>/turn_off" service path used to switch
# off an entity of DOMAIN. The caller derives the matching turn_on service by
# replacing the part after the slash.
get_toggle_service() {
  local domain="$1"
  case "$domain" in
    switch | light)
      echo "$domain/turn_off"
      ;;
    *)
      # Any other domain: use Home Assistant's generic service, which is not
      # tied to one entity domain. The previous fallback, switch/turn_off,
      # can only act on switch entities.
      echo "homeassistant/turn_off"
      ;;
  esac
}
# _ha_call_service ACTION SERVICE ENTITY
# POSTs one Home Assistant service call for ENTITY and logs the outcome.
#   ACTION   "turn_off" or "turn_on" (used only in log messages)
#   SERVICE  service path below /api/services, e.g. "switch/turn_off"
# Globals:  HA_URL, HA_TOKEN (read); HA_TIMEOUT (read, optional: request
#           timeout in seconds, default 10)
# Returns:  0 when the HTTP status is 2xx, 1 otherwise
_ha_call_service() {
  local action="$1" service="$2" entity="$3"
  local label="Turn ${action#turn_}" # "Turn off" / "Turn on"
  local response http_code body

  log "Sending $action request to Home Assistant ($service)..."
  log "DEBUG: POST to $HA_URL/api/services/$service with entity_id=$entity"
  # --max-time keeps an unresponsive Home Assistant from blocking the monitor
  # forever. -w appends a newline plus the status code after the body.
  response=$(curl -s -w "\n%{http_code}" --max-time "${HA_TIMEOUT:-10}" -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/$service" 2>&1)
  # Split with parameter expansion instead of tail/head: the status code is
  # the text after the last newline, the body is everything before it.
  http_code="${response##*$'\n'}"
  if [[ "$response" == *$'\n'* ]]; then
    body="${response%$'\n'*}"
  else
    body=""
  fi

  if [[ "$http_code" =~ ^2[0-9]{2}$ ]]; then
    log "$label request sent successfully (HTTP $http_code)"
    return 0
  fi
  log "ERROR: Failed to send $action request (HTTP $http_code)"
  log "ERROR: Response body: $body"
  return 1
}

# trigger_power_cycle ENTITY
# Power-cycles ENTITY through Home Assistant: turn off, wait 10 seconds, turn
# on. Failures are logged but do not abort the sequence, so the turn_on
# request is sent even when turn_off failed.
# Globals:  ERROR_START_TIME, IN_GRACE_PERIOD (reset on every call)
# Returns:  0
trigger_power_cycle() {
  local entity="$1"
  local domain turn_off_service turn_on_service
  domain=$(get_entity_domain "$entity")
  turn_off_service=$(get_toggle_service "$domain")
  # Same domain as the turn_off service, with the action swapped
  turn_on_service="${turn_off_service%/*}/turn_on"

  log "ALERT: Triggering power cycle for entity: $entity (domain: $domain)"

  _ha_call_service turn_off "$turn_off_service" "$entity"

  log "Waiting 10 seconds before power-on..."
  sleep 10

  if _ha_call_service turn_on "$turn_on_service" "$entity"; then
    log "Power cycle completed for $entity"
  fi

  # Reset state so the next 502 starts a fresh grace period
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false
}
# Startup banner: record the effective settings (HA_TOKEN is not logged).
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
for setting_name in TARGET_URL HA_URL HA_ENTITY; do
  log " ${setting_name}: ${!setting_name}"
done
log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"
# Refuse to enter the main loop when the Home Assistant API probe fails.
test_ha_connection || {
  log "FATAL: Cannot connect to Home Assistant. Exiting."
  exit 1
}
# Main monitoring loop. Every CHECK_INTERVAL seconds the target is probed:
#   * the first 502 starts a grace period of GRACE_PERIOD seconds;
#   * a 502 seen once the grace period has elapsed triggers a power cycle;
#   * any real HTTP response other than 502 during the grace period counts
#     as recovery and clears the error state;
#   * "000" (curl got no HTTP response) is neither a failure nor a recovery.
while true; do
  RESPONSE_CODE=$(check_target)
  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"
  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting $(( GRACE_PERIOD / 60 ))-minute grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))
      if (( ELAPSED >= GRACE_PERIOD )); then
        # Grace period expired - trigger power cycle
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
        # trigger_power_cycle resets these as well; repeated here so the
        # loop's state handling can be read in one place.
        ERROR_START_TIME=""
        IN_GRACE_PERIOD=false
      else
        # Still within grace period
        REMAINING=$((GRACE_PERIOD - ELAPSED))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  elif [[ "$RESPONSE_CODE" == "000" ]]; then
    # No HTTP response at all (timeout or connection failure). This is not
    # evidence that the service recovered, so an outage that is already being
    # timed keeps its start time instead of being cleared.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response during grace period - not treating as recovery, keeping error state"
    fi
  else
    # Service answered with a real HTTP status other than 502
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      # Service recovered during grace period
      CURRENT_TIME=$(date +%s)
      RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
      log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
      ERROR_START_TIME=""
      IN_GRACE_PERIOD=false
    elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
      # Service is healthy; note when its status code changes
      log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
    fi
  fi
  LAST_RESPONSE_CODE="$RESPONSE_CODE"
  # Wait for next check
  sleep "$CHECK_INTERVAL"
done