Dockerised and prepared with .env
This commit is contained in:
146
thinkcenter_monitor.sh
Normal file
146
thinkcenter_monitor.sh
Normal file
@@ -0,0 +1,146 @@
|
||||
#!/bin/bash
#
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
#
# Environment variables:
#   TARGET_URL      URL probed with curl on every check
#   HA_URL          Base URL of the Home Assistant instance (no trailing slash needed)
#   HA_TOKEN        Bearer token sent to Home Assistant (required, no default)
#   HA_ENTITY       Entity id passed to the switch/turn_off and switch/turn_on services
#   LOG_FILE        File every log line is appended to
#   GRACE_PERIOD    Seconds a 502 streak may last before a power cycle is triggered
#   CHECK_INTERVAL  Seconds slept between two checks

# Configuration from environment variables with sensible defaults
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
# Service URLs are built as "$HA_URL/api/...": drop one trailing slash so a
# value such as "http://ha:8123/" does not produce "//api/...".
HA_URL="${HA_URL%/}"
HA_TOKEN="${HA_TOKEN:-}"
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"     # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"  # 30 seconds between checks

# State variables
ERROR_START_TIME=""     # epoch seconds of the first 502 in the current streak; empty when none
IN_GRACE_PERIOD=false   # true while a 502 streak is being timed
LAST_RESPONSE_CODE=""   # response code seen on the previous check

# Initialize log file
# Done before validation so that the validation errors below can actually be
# appended to the log (tee cannot create a file in a missing directory).
if ! mkdir -p "$(dirname "$LOG_FILE")" || ! touch "$LOG_FILE"; then
  # Not fatal: tee still copies every message to stdout.
  echo "WARNING: cannot write to log file $LOG_FILE" >&2
fi

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

# GRACE_PERIOD is used in integer arithmetic and comparisons; reject anything
# that is not a plain non-negative integer instead of failing later at runtime.
if [[ ! "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
  echo "ERROR: GRACE_PERIOD must be a whole number of seconds (got '$GRACE_PERIOD'). Exiting." | tee -a "$LOG_FILE"
  exit 1
fi
# Force base 10 so a value with leading zeros (e.g. "0300") is not read as octal.
GRACE_PERIOD=$((10#$GRACE_PERIOD))

#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and appended to $LOG_FILE
#######################################
log() {
  local message="$1"
  local timestamp

  # Declared separately from the assignment so `local` does not mask date's status.
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # printf '%s' prints the message verbatim regardless of shell echo settings.
  printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}

#######################################
# Probe TARGET_URL once and report the HTTP status code.
# Globals:   TARGET_URL (read)
# Arguments: none
# Outputs:   the status code on stdout; "000" when no HTTP response was
#            obtained (curl prints 000 itself on timeout/connection failure,
#            and an empty result is normalised to 000 as well)
#######################################
check_target() {
  local response_code

  # Use curl to get HTTP response code without following redirects.
  # -m 5 caps the whole request at five seconds.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" -m 5 "$TARGET_URL" 2>/dev/null)

  printf '%s\n' "${response_code:-000}"
}

#######################################
# Call one Home Assistant switch service for a single entity.
# Globals:   HA_URL, HA_TOKEN (read)
# Arguments: $1 - service name (turn_off or turn_on)
#            $2 - entity id
# Returns:   curl's exit status. --fail makes HTTP errors (status >= 400, e.g.
#            a rejected token) count as failures, and --max-time keeps an
#            unresponsive Home Assistant from blocking the monitor forever.
#######################################
_ha_switch_service() {
  local service="$1"
  local entity="$2"

  curl -s --fail --max-time 30 -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/switch/$service" > /dev/null 2>&1
}

#######################################
# Power-cycle a machine: switch the entity off, wait, switch it back on.
# The turn_on request is sent even when turn_off failed.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (reset on exit)
# Arguments: $1 - Home Assistant entity id
#            $2 - seconds to wait between off and on (optional, default 10)
#######################################
trigger_power_cycle() {
  local entity="$1"
  local delay="${2:-10}"

  log "ALERT: Triggering power cycle for entity: $entity"

  # Turn off
  log "Sending turn_off request to Home Assistant..."
  if _ha_switch_service turn_off "$entity"; then
    log "Turn off request sent successfully"
  else
    log "ERROR: Failed to send turn_off request"
  fi

  # Wait before powering back on
  log "Waiting ${delay} seconds before power-on..."
  sleep "$delay"

  # Turn on
  log "Sending turn_on request to Home Assistant..."
  if _ha_switch_service turn_on "$entity"; then
    log "Turn on request sent successfully"
    log "Power cycle completed for $entity"
  else
    log "ERROR: Failed to send turn_on request"
  fi

  # Reset state
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false
}

# Startup banner: record the effective configuration once.
# HA_TOKEN is not included in the output.
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
log " TARGET_URL: $TARGET_URL"
log " HA_URL: $HA_URL"
log " HA_ENTITY: $HA_ENTITY"
log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

#######################################
# Advance the grace-period state machine for one probe result.
#   502            -> start the grace period, or power-cycle once it has expired
#   000 / empty    -> no HTTP response at all; neither a failure nor a recovery,
#                     so an active grace period keeps running
#   anything else  -> the service answered; clear an active grace period
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (read/written)
#            GRACE_PERIOD, HA_ENTITY, LAST_RESPONSE_CODE (read)
# Arguments: $1 - response code as printed by check_target
#######################################
handle_response() {
  local code="$1"
  local now elapsed

  now=$(date +%s)

  if [[ "$code" == "502" ]]; then
    # 502 Bad Gateway error detected
    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$now
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
      return 0
    fi

    # Already in grace period - check if it has expired
    elapsed=$(( now - ERROR_START_TIME ))
    if (( elapsed >= GRACE_PERIOD )); then
      # Grace period expired - trigger power cycle
      log "Grace period expired after ${elapsed}s. Service still unavailable."
      trigger_power_cycle "$HA_ENTITY"
      # Reset here as well so the loop's state does not depend on the callee.
      ERROR_START_TIME=""
      IN_GRACE_PERIOD=false
    else
      # Still within grace period
      log "Still in grace period. Service recovery window: $(( GRACE_PERIOD - elapsed ))s remaining"
    fi
    return 0
  fi

  if [[ -z "$code" || "$code" == "000" ]]; then
    # Timeout or connection failure: the service did not answer, so this must
    # not be counted as a recovery that resets the grace-period timer.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response during grace period - not counted as recovery, error state kept"
    fi
    return 0
  fi

  # Service is responding (a real HTTP status other than 502)
  if [[ "$IN_GRACE_PERIOD" == true ]]; then
    # Service recovered during grace period
    elapsed=$(( now - ERROR_START_TIME ))
    log "Service recovered during grace period after ${elapsed}s. Resetting error state."
    ERROR_START_TIME=""
    IN_GRACE_PERIOD=false
  elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$code" ]]; then
    log "Service status changed from $LAST_RESPONSE_CODE to $code"
  fi
  return 0
}

# Main monitoring loop
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  handle_response "$RESPONSE_CODE"

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check
  sleep "$CHECK_INTERVAL"
done

Reference in New Issue
Block a user