Dockerised and prepared with .env
This commit is contained in:
32
.env.example
Normal file
32
.env.example
Normal file
@@ -0,0 +1,32 @@
|
||||
# Thinkcentre Monitor Configuration
|
||||
# Copy this file to .env and fill in your actual values
|
||||
|
||||
# Target URL to monitor for health checks
|
||||
# This is the service URL that will be checked every CHECK_INTERVAL seconds
|
||||
TARGET_URL=http://kubernetes-service:8080
|
||||
|
||||
# Home Assistant configuration
|
||||
# HA_URL: Base URL of your Home Assistant instance
|
||||
# Example: http://homeassistant:8123 or http://192.168.1.100:8123
|
||||
HA_URL=http://homeassistant:8123
|
||||
|
||||
# HA_TOKEN: Long-lived access token from Home Assistant
|
||||
# To generate: in Home Assistant, click your user profile (bottom of the sidebar) → Security → Long-lived access tokens → Create token
|
||||
# REQUIRED - the script will exit if this is not set
|
||||
HA_TOKEN=your_long_lived_access_token_here
|
||||
|
||||
# HA_ENTITY: Home Assistant entity ID for the power switch
|
||||
# This is the switch entity that will be toggled to power-cycle the machine
|
||||
# Example: switch.thinkcentre_power or switch.machine_reboot_relay
|
||||
HA_ENTITY=switch.thinkcentre_power
|
||||
|
||||
# Grace period in seconds
|
||||
# When a 502 error is detected, the system waits this long for recovery
|
||||
# (useful for deployment scenarios where services may temporarily be unavailable)
|
||||
# Default: 300 (5 minutes)
|
||||
GRACE_PERIOD=300
|
||||
|
||||
# Health check interval in seconds
|
||||
# How frequently to check the target URL
|
||||
# Default: 30 seconds
|
||||
CHECK_INTERVAL=30
|
||||
22
Dockerfile
Normal file
22
Dockerfile
Normal file
@@ -0,0 +1,22 @@
|
||||
# Base image is pinned so rebuilds are reproducible; bump the tag deliberately.
FROM alpine:3.20

# Install required dependencies:
#   bash            - interpreter for the monitoring script
#   curl            - health checks and Home Assistant API calls
#   ca-certificates - TLS trust store for HTTPS targets
RUN apk add --no-cache \
    bash \
    curl \
    ca-certificates

# Create application directory
WORKDIR /app

# Copy monitoring script
COPY thinkcenter_monitor.sh /app/thinkcenter_monitor.sh

# Create the log directory and make the script executable in a single layer
RUN mkdir -p /var/log \
    && chmod +x /app/thinkcenter_monitor.sh

# Default command: run the monitor in the foreground as the container's main process
CMD ["/app/thinkcenter_monitor.sh"]
|
||||
243
README.md
243
README.md
@@ -1,3 +1,242 @@
|
||||
# Thinkcentre-watchdog
|
||||
# Thinkcentre Watchdog
|
||||
|
||||
A watchdog daemon for my thinkcentre machines which tend to hang.
|
||||
A Docker-based monitoring solution for detecting and auto-rebooting hung Kubernetes machines via Home Assistant integration.
|
||||
|
||||
## Overview
|
||||
|
||||
This watchdog monitors a target service URL for 502 Bad Gateway errors (indicating a hung machine). When a service fails:
|
||||
|
||||
1. A 5-minute grace period begins (allowing for deployment recoveries)
|
||||
2. If the service recovers within 5 minutes, the error is cleared (normal deployment scenario)
|
||||
3. If still failing after 5 minutes, an automatic power-cycle is triggered via Home Assistant
|
||||
4. The machine powers off for 10 seconds, then powers back on
|
||||
|
||||
All activity is logged with timestamps for monitoring and troubleshooting.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Docker and Docker Compose installed
|
||||
- Home Assistant instance running with network access
|
||||
- A power switch entity configured in Home Assistant
|
||||
- Long-lived access token from Home Assistant
|
||||
|
||||
## Installation
|
||||
|
||||
### 1. Download/Organize Files
|
||||
|
||||
Clone or download this repository to your machine:
|
||||
|
||||
```bash
|
||||
git clone <repository-url>
|
||||
cd Thinkcentre-watchdog
|
||||
```
|
||||
|
||||
The directory should contain:
|
||||
- `Dockerfile` - Container definition
|
||||
- `thinkcenter_monitor.sh` - Monitoring script
|
||||
- `docker-compose.yml` - Docker Compose configuration
|
||||
- `.env.example` - Environment variable template
|
||||
- `README.md` - This file
|
||||
|
||||
### 2. Create Configuration File
|
||||
|
||||
Copy the example environment file and edit it with your actual values:
|
||||
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
Edit `.env` and configure:
|
||||
|
||||
```bash
|
||||
# Your target service URL
|
||||
TARGET_URL=http://your-kubernetes-service:8080
|
||||
|
||||
# Home Assistant configuration
|
||||
HA_URL=http://homeassistant:8123
|
||||
HA_TOKEN=your_long_lived_access_token_here
|
||||
HA_ENTITY=switch.your_power_switch_entity
|
||||
|
||||
# Optional: Adjust timing if needed
|
||||
GRACE_PERIOD=300 # 5 minutes
|
||||
CHECK_INTERVAL=30 # Check every 30 seconds
|
||||
```
|
||||
|
||||
### 3. Generate Home Assistant Token
|
||||
|
||||
1. Open Home Assistant web interface
|
||||
2. Click your user profile at the bottom of the sidebar, open the **Security** tab, and scroll to **Long-Lived Access Tokens**
|
||||
3. Click **Create Token**
|
||||
4. Name it (e.g., "Thinkcentre Watchdog")
|
||||
5. Copy the token and paste it in your `.env` file as `HA_TOKEN`
|
||||
|
||||
### 4. Configure Power Switch in Home Assistant
|
||||
|
||||
Ensure you have a switch entity in Home Assistant that controls the machine's power. Common options:
|
||||
|
||||
- **Smart Outlet/Relay**: If using a smart power outlet
|
||||
- **IPMI/Redfish**: For datacenter machines
|
||||
- **Smart Plug**: Like Tasmota, Zigbee, or Z-Wave devices
|
||||
|
||||
Configure the entity ID in your `.env` as `HA_ENTITY` (e.g., `switch.thinkcentre_power`)
|
||||
|
||||
### 5. Build and Run
|
||||
|
||||
Start the monitoring container:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
The container will:
|
||||
- Build from the Dockerfile
|
||||
- Start with `restart: unless-stopped` policy
|
||||
- Bind-mount the host directory `./logs` to `/var/log` so the log file persists across restarts
|
||||
- Apply resource limits (0.1 CPU, 64MB memory)
|
||||
|
||||
### 6. View Logs
|
||||
|
||||
Monitor real-time logs:
|
||||
|
||||
```bash
|
||||
docker compose logs -f thinkcenter-monitor
|
||||
```
|
||||
|
||||
Or view the persistent log file, which is bind-mounted to `./logs` next to `docker-compose.yml`:
|
||||
|
||||
```bash
|
||||
tail -f ./logs/thinkcenter_monitor.log
|
||||
# The container's /var/log directory is mapped to ./logs on the host
|
||||
```
|
||||
|
||||
### 7. Stop or Restart
|
||||
|
||||
Stop the container:
|
||||
|
||||
```bash
|
||||
docker compose down
|
||||
```
|
||||
|
||||
Restart the container:
|
||||
|
||||
```bash
|
||||
docker compose restart thinkcenter-monitor
|
||||
```
|
||||
|
||||
## Deploying Multiple Instances
|
||||
|
||||
To monitor multiple machines:
|
||||
|
||||
### For Machine 2:
|
||||
|
||||
Create a separate directory:
|
||||
|
||||
```bash
|
||||
mkdir thinkcentre-watchdog-machine2
|
||||
cd thinkcentre-watchdog-machine2
|
||||
|
||||
# Copy files
|
||||
cp /path/to/original/* /path/to/original/.env.example .   # "*" does not match dotfiles, so copy .env.example explicitly
|
||||
|
||||
# Create unique .env
|
||||
cp .env.example .env
|
||||
|
||||
# Edit .env for machine 2
|
||||
nano .env
|
||||
# Change: HA_ENTITY=switch.machine2_power
|
||||
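# Change: TARGET_URL to machine 2's service URL
# Also give container_name in docker-compose.yml a unique value; two containers cannot share the name thinkcenter-monitor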
|
||||
```
|
||||
|
||||
Then run:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### Using Namespace (Alternative)
|
||||
|
||||
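Or manage everything from one directory by adding an override file (for example `docker-compose.machine2.yml`, which you need to create yourself) that defines a second service with its own name, container name, and environment: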
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f docker-compose.machine2.yml up -d
|
||||
```
|
||||
|
||||
## Configuration Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `TARGET_URL` | `http://localhost:8080` | Service URL to monitor |
|
||||
| `HA_URL` | `http://homeassistant:8123` | Home Assistant base URL |
|
||||
| `HA_TOKEN` | (required) | Home Assistant long-lived access token |
|
||||
| `HA_ENTITY` | `switch.thinkcentre_power` | Home Assistant switch entity ID |
|
||||
| `GRACE_PERIOD` | `300` | Seconds to wait before power-cycling (5 minutes) |
|
||||
| `CHECK_INTERVAL` | `30` | Seconds between health checks |
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Container won't start
|
||||
|
||||
Check if `HA_TOKEN` is set:
|
||||
```bash
|
||||
docker compose config | grep HA_TOKEN
|
||||
```
|
||||
|
||||
### No logs appearing
|
||||
|
||||
Check the log bind mount (`./logs` on the host, `/var/log` in the container):
|
||||
```bash
|
||||
ls -l ./logs
|
||||
docker compose exec thinkcenter-monitor ls -l /var/log
|
||||
```
|
||||
|
||||
### Power-cycle not triggering
|
||||
|
||||
1. Verify HA_TOKEN is valid (check Home Assistant logs)
|
||||
2. Confirm HA_ENTITY exists in Home Assistant
|
||||
3. Check network connectivity: `docker compose exec thinkcenter-monitor curl -v http://homeassistant:8123`
|
||||
|
||||
### Service not responding correctly
|
||||
|
||||
Test the target URL directly:
|
||||
```bash
|
||||
docker compose exec thinkcenter-monitor curl -v http://your-service:8080
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. **Health Check**: Every `CHECK_INTERVAL` seconds, HTTP response code is checked
|
||||
2. **Grace Period**: First 502 error triggers a 5-minute window for recovery
|
||||
3. **Recovery Detection**: If service returns non-502 during grace period, error resets
|
||||
4. **Power Cycle**: After grace period expires with continued 502s, power cycle triggers:
|
||||
- Send turn_off to HA switch entity
|
||||
- Wait 10 seconds
|
||||
- Send turn_on to HA switch entity
|
||||
5. **Logging**: All events timestamped and logged to `/var/log/thinkcenter_monitor.log`
|
||||
|
||||
## Resource Limits
|
||||
|
||||
- CPU: 0.1 cores (limited to prevent resource hogging)
|
||||
- Memory: 64MB (minimal requirements for bash + curl)
|
||||
- Logging: JSON file driver, max 10MB per file, keeps 3 files (30MB total)
|
||||
|
||||
## Debugging
|
||||
|
||||
The script has no separate verbose mode; every check is already logged. View the most recent log lines with:
|
||||
|
||||
```bash
|
||||
docker compose logs --tail 50 thinkcenter-monitor
|
||||
```
|
||||
|
||||
To test the script locally (without Docker):
|
||||
|
||||
```bash
|
||||
bash thinkcenter_monitor.sh
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
Monitoring solution for Thinkcentre machines.
|
||||
|
||||
## Support
|
||||
|
||||
For issues or improvements, check the logs first and verify all environment variables are correctly set in your `.env` file.
|
||||
|
||||
35
docker-compose.yml
Normal file
35
docker-compose.yml
Normal file
@@ -0,0 +1,35 @@
|
||||
version: '3.8'

services:
  thinkcenter-monitor:
    build:
      context: .
      dockerfile: Dockerfile
    # Container names must be unique per Docker host. Set CONTAINER_NAME in
    # .env when running more than one watchdog; the default keeps the
    # original name.
    container_name: ${CONTAINER_NAME:-thinkcenter-monitor}
    restart: unless-stopped

    # Values are taken from .env; the script applies its own defaults to any
    # value that is empty, except HA_TOKEN, which is required.
    environment:
      TARGET_URL: ${TARGET_URL}
      HA_URL: ${HA_URL}
      HA_TOKEN: ${HA_TOKEN}
      HA_ENTITY: ${HA_ENTITY}
      GRACE_PERIOD: ${GRACE_PERIOD}
      CHECK_INTERVAL: ${CHECK_INTERVAL}
      LOG_FILE: /var/log/thinkcenter_monitor.log

    # Bind mount: the log file ends up in ./logs on the host
    volumes:
      - ./logs:/var/log

    # Resource limits
    deploy:
      resources:
        limits:
          cpus: '0.1'
          memory: 64M

    # JSON file logging
    logging:
      driver: json-file
      options:
        max-file: '3'
        max-size: '10m'
|
||||
@@ -1,108 +0,0 @@
|
||||
#!/bin/bash
|
||||
# Monitoring script for pandora machine
|
||||
# Checks if vorgabenportal.knowyoursecurity.com returns 502 Bad Gateway (indicates pandora hung)
|
||||
# If 502 detected for >5 minutes, power-cycles the machine via Home Assistant
|
||||
|
||||
# Configuration
|
||||
TARGET_URL="https://vorgabenportal.knowyoursecurity.com"
|
||||
HA_URL="http://homeassistant.local:8123" # Home Assistant URL
|
||||
HA_TOKEN="YOUR_LONG_LIVED_ACCESS_TOKEN" # Get from HA: Profile > Long-Lived Access Tokens
|
||||
HA_ENTITY="switch.pandora_power" # Entity ID of the smart switch/outlet
|
||||
LOG_FILE="/var/log/pandora_monitor.log"
|
||||
MAX_FAILURES=2 # Reboot after 2 consecutive failures
|
||||
FAILURE_COUNT=0
|
||||
FAILURE_START_TIME=0
|
||||
GRACE_PERIOD=300 # 5 minutes grace for deployments
|
||||
CHECK_INTERVAL=60 # Check every 1 minute (more granular during grace period)
|
||||
|
||||
# Function to log messages
|
||||
# Print a timestamped message to stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log_message() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  echo "[${stamp}] $1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Function to check if machine is responsive
|
||||
# If pandora hangs, the web service behind it returns 502 Bad Gateway
|
||||
# Returns the HTTP code for caller to evaluate
|
||||
# Probe $TARGET_URL and print the HTTP status code for the caller to evaluate.
# The request is capped at 10 seconds overall and 5 seconds for the connect
# phase; curl's own error output is discarded.
# Globals: TARGET_URL (read)
# Outputs: the status code on stdout
check_responsive() {
  local status
  status="$(curl --silent --output /dev/null --write-out "%{http_code}" \
    --max-time 10 \
    --connect-timeout 5 \
    "$TARGET_URL" 2>/dev/null)"

  echo "$status"
}
|
||||
|
||||
# Function to power cycle via Home Assistant
|
||||
# Power-cycle the machine through Home Assistant: switch the entity off,
# wait 10 seconds, then switch it back on. The result of the API calls is
# not inspected; both log lines are written unconditionally.
# Globals: HA_URL, HA_TOKEN, HA_ENTITY, FAILURE_COUNT (read)
power_cycle() {
  local payload='{"entity_id":"'"$HA_ENTITY"'"}'
  # Request shared by both service calls; only the URL differs.
  local -a request=(
    curl -s -X POST
    -H "Authorization: Bearer $HA_TOKEN"
    -H "Content-Type: application/json"
    -d "$payload"
  )

  log_message "⚠️ ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..."

  # Turn off the switch
  "${request[@]}" "$HA_URL/api/services/switch/turn_off" &>/dev/null
  log_message "Power OFF command sent to $HA_ENTITY"

  sleep 10 # Wait 10 seconds for machine to shut down

  # Turn on the switch
  "${request[@]}" "$HA_URL/api/services/switch/turn_on" &>/dev/null
  log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting"
}
|
||||
|
||||
# Main monitoring loop
log_message "========================================"
log_message "Starting pandora monitoring service"
log_message "Monitoring: $TARGET_URL"
log_message "HA Entity: $HA_ENTITY"
log_message "Check interval: ${CHECK_INTERVAL}s"
log_message "Grace period: ${GRACE_PERIOD}s"
log_message "========================================"

while :; do
  http_code="$(check_responsive)"
  current_time="$(date +%s)"

  case "$http_code" in
    502 | 000)
      # Failure: 502 from the gateway, or 000 when curl got no HTTP response
      if ((FAILURE_COUNT == 0)); then
        # First failure - start grace period
        FAILURE_START_TIME=$current_time
        FAILURE_COUNT=1
        log_message "⚠️ First 502/timeout detected (HTTP $http_code) - Starting 5 minute grace period"
      else
        # Already in failure state
        FAILURE_COUNT=$((FAILURE_COUNT + 1))
        elapsed=$((current_time - FAILURE_START_TIME))
        remaining=$((GRACE_PERIOD - elapsed))

        if ((remaining > 0)); then
          log_message "⚠️ Still seeing 502 (HTTP $http_code) - Grace period: ${remaining}s remaining"
        else
          # Grace period expired - trigger reboot
          log_message "🔴 CRITICAL: Service unresponsive for >5 minutes! Triggering power cycle..."
          power_cycle
          FAILURE_COUNT=0
          FAILURE_START_TIME=0
          sleep 120 # Wait 2 minutes after reboot attempt before checking again
        fi
      fi
      ;;
    *)
      # Any other status counts as healthy
      if ((FAILURE_COUNT > 0)); then
        log_message "✓ Service recovered! (was down $FAILURE_COUNT times)"
      fi
      FAILURE_COUNT=0
      FAILURE_START_TIME=0
      log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
      ;;
  esac

  sleep "$CHECK_INTERVAL"
done
|
||||
146
thinkcenter_monitor.sh
Normal file
146
thinkcenter_monitor.sh
Normal file
@@ -0,0 +1,146 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
|
||||
# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
|
||||
|
||||
# Configuration from environment variables with sensible defaults
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
HA_TOKEN="${HA_TOKEN:-}" # required; validated below
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"    # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}" # 30 seconds between checks

# State variables
ERROR_START_TIME=""
IN_GRACE_PERIOD=false
LAST_RESPONSE_CODE=""

# Initialize log file first, so that the validation errors below can be
# appended to it even on a fresh volume where the directory does not exist yet.
mkdir -p "$(dirname "$LOG_FILE")"
touch "$LOG_FILE"

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
  exit 1
fi

# Both timing values are used in integer arithmetic and comparisons later on;
# reject anything that is not a plain non-negative integer up front instead
# of failing inside the monitoring loop.
for _setting in GRACE_PERIOD CHECK_INTERVAL; do
  if [[ ! "${!_setting}" =~ ^[0-9]+$ ]]; then
    echo "ERROR: $_setting must be a non-negative integer (got '${!_setting}'). Exiting." | tee -a "$LOG_FILE"
    exit 1
  fi
done
unset _setting
|
||||
|
||||
# Write a timestamped message to stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  echo "[${stamp}] $1" | tee -a "$LOG_FILE"
}
|
||||
|
||||
# Probe $TARGET_URL once and print the HTTP status code.
# Redirects are not followed, the request is capped at 5 seconds, and curl's
# own error output is discarded.
# Globals: TARGET_URL (read)
# Outputs: the status code on stdout
check_target() {
  local status
  status="$(curl --silent --output /dev/null --write-out "%{http_code}" \
    --max-time 5 "$TARGET_URL" 2>/dev/null)"
  echo "$status"
}
|
||||
|
||||
# Call one Home Assistant switch service for a single entity.
# Globals:   HA_URL, HA_TOKEN (read)
# Arguments: $1 - service name (turn_off or turn_on)
#            $2 - entity ID
# Returns:   curl's exit status. --fail makes HTTP status >= 400 (bad token,
#            unknown service, server error) a non-zero exit instead of a
#            silent success; the timeouts keep an unresponsive Home
#            Assistant from blocking the monitor indefinitely.
_ha_switch_service() {
  local service="$1"
  local entity="$2"

  curl -s --fail \
    --connect-timeout 5 \
    --max-time 15 \
    -X POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "$HA_URL/api/services/switch/$service" > /dev/null 2>&1
}

# Power-cycle a machine via Home Assistant: turn the switch off, wait
# 10 seconds, turn it back on, then clear the grace-period state.
# turn_on is sent even when turn_off failed, so the switch is never left
# off because of a half-completed cycle.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (written)
# Arguments: $1 - Home Assistant switch entity ID
# Returns:   0 if both requests were accepted, 1 if either one failed
trigger_power_cycle() {
  local entity="$1"
  local status=0

  log "ALERT: Triggering power cycle for entity: $entity"

  # Turn off
  log "Sending turn_off request to Home Assistant..."
  if _ha_switch_service turn_off "$entity"; then
    log "Turn off request sent successfully"
  else
    log "ERROR: Failed to send turn_off request"
    status=1
  fi

  # Wait 10 seconds
  log "Waiting 10 seconds before power-on..."
  sleep 10

  # Turn on
  log "Sending turn_on request to Home Assistant..."
  if _ha_switch_service turn_on "$entity"; then
    log "Turn on request sent successfully"
    log "Power cycle completed for $entity"
  else
    log "ERROR: Failed to send turn_on request"
    status=1
  fi

  # Reset state
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false

  return "$status"
}
|
||||
|
||||
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
log " TARGET_URL: $TARGET_URL"
log " HA_URL: $HA_URL"
log " HA_ENTITY: $HA_ENTITY"
log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

# Main monitoring loop.
# Each iteration classifies the probe result into one of three cases:
#   502   - failure: start the grace period, or power-cycle once it expired
#   000   - curl received no HTTP response (timeout, refused connection,
#           DNS failure); inconclusive, so it neither starts nor clears
#           the grace period
#   other - the service answered: clear any running grace period
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected

    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))

      if [[ $ELAPSED -ge $GRACE_PERIOD ]]; then
        # Grace period expired - trigger power cycle
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
        ERROR_START_TIME=""
        IN_GRACE_PERIOD=false
      else
        # Still within grace period
        REMAINING=$((GRACE_PERIOD - ELAPSED))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  elif [[ "$RESPONSE_CODE" == "000" ]]; then
    # No HTTP response at all. This is not evidence that the service is
    # back, so a running grace period must keep counting rather than be
    # reset as a "recovery".
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      log "No HTTP response (timeout or connection error) during grace period - not counted as recovery"
    fi
  else
    # Service is responding (any code other than 502 or 000)
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      # Service recovered during grace period
      CURRENT_TIME=$(date +%s)
      RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
      log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
      ERROR_START_TIME=""
      IN_GRACE_PERIOD=false
    elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
      # Healthy, but the status code differs from the previous check
      log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
    fi
  fi

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check
  sleep "$CHECK_INTERVAL"
done
|
||||
Reference in New Issue
Block a user