Compare commits
2 Commits
058eeb5246
...
257e4c2062
| Author | SHA1 | Date | |
|---|---|---|---|
| 257e4c2062 | |||
| a3b13c5fe6 |
32
.env.example
Normal file
32
.env.example
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Thinkcentre Monitor Configuration
|
||||||
|
# Copy this file to .env and fill in your actual values
|
||||||
|
|
||||||
|
# Target URL to monitor for health checks
|
||||||
|
# This is the service URL that will be checked every CHECK_INTERVAL seconds
|
||||||
|
TARGET_URL=http://kubernetes-service:8080
|
||||||
|
|
||||||
|
# Home Assistant configuration
|
||||||
|
# HA_URL: Base URL of your Home Assistant instance
|
||||||
|
# Example: http://homeassistant:8123 or http://192.168.1.100:8123
|
||||||
|
HA_URL=http://homeassistant:8123
|
||||||
|
|
||||||
|
# HA_TOKEN: Long-lived access token from Home Assistant
|
||||||
|
# To generate: in Home Assistant, open your user profile (click your name at the bottom of the sidebar) → Security → Long-lived access tokens → Create token
|
||||||
|
# REQUIRED - the script will exit if this is not set
|
||||||
|
HA_TOKEN=your_long_lived_access_token_here
|
||||||
|
|
||||||
|
# HA_ENTITY: Home Assistant entity ID for the power switch
|
||||||
|
# This is the switch entity that will be toggled to power-cycle the machine
|
||||||
|
# Example: switch.thinkcentre_power or switch.machine_reboot_relay
|
||||||
|
HA_ENTITY=switch.thinkcentre_power
|
||||||
|
|
||||||
|
# Grace period in seconds
|
||||||
|
# When a 502 error is detected, the system waits this long for recovery
|
||||||
|
# (useful for deployment scenarios where services may temporarily be unavailable)
|
||||||
|
# Default: 300 (5 minutes)
|
||||||
|
GRACE_PERIOD=300
|
||||||
|
|
||||||
|
# Health check interval in seconds
|
||||||
|
# How frequently to check the target URL
|
||||||
|
# Default: 30 seconds
|
||||||
|
CHECK_INTERVAL=30
|
||||||
22
Dockerfile
Normal file
22
Dockerfile
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Pin the base image to a release series instead of the floating "latest" tag
# so rebuilds are reproducible; bump this deliberately when upgrading.
FROM alpine:3.20

# Install required dependencies:
#   bash            - the monitor script uses bash-only syntax ([[ ]], local, $(( )))
#   curl            - health checks and Home Assistant API calls
#   ca-certificates - lets curl verify HTTPS endpoints
RUN apk add --no-cache \
    bash \
    curl \
    ca-certificates

# Create application directory
WORKDIR /app

# Copy monitoring script
COPY thinkcenter_monitor.sh /app/thinkcenter_monitor.sh

# Create the log directory and make the script executable in a single layer
RUN mkdir -p /var/log \
    && chmod +x /app/thinkcenter_monitor.sh

# Run the monitor as the container's main process (exec form, so it is PID 1
# and receives stop signals directly)
CMD ["/app/thinkcenter_monitor.sh"]
|
||||||
243
README.md
243
README.md
@@ -1,3 +1,242 @@
|
|||||||
# Thinkcentre-watchdog
|
# Thinkcentre Watchdog
|
||||||
|
|
||||||
A watchdog daemon for my thinkcentre machines which tend to hang.
|
A Docker-based monitoring solution for detecting and auto-rebooting hung Kubernetes machines via Home Assistant integration.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This watchdog monitors a target service URL for 502 Bad Gateway errors (indicating a hung machine). When a service fails:
|
||||||
|
|
||||||
|
1. A 5-minute grace period begins (allowing for deployment recoveries)
|
||||||
|
2. If the service recovers within 5 minutes, the error is cleared (normal deployment scenario)
|
||||||
|
3. If still failing after 5 minutes, an automatic power-cycle is triggered via Home Assistant
|
||||||
|
4. The machine powers off for 10 seconds, then powers back on
|
||||||
|
|
||||||
|
All activity is logged with timestamps for monitoring and troubleshooting.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Docker and Docker Compose installed
|
||||||
|
- Home Assistant instance running with network access
|
||||||
|
- A power switch entity configured in Home Assistant
|
||||||
|
- Long-lived access token from Home Assistant
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
### 1. Download/Organize Files
|
||||||
|
|
||||||
|
Clone or download this repository to your machine:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <repository-url>
|
||||||
|
cd Thinkcentre-watchdog
|
||||||
|
```
|
||||||
|
|
||||||
|
The directory should contain:
|
||||||
|
- `Dockerfile` - Container definition
|
||||||
|
- `thinkcenter_monitor.sh` - Monitoring script
|
||||||
|
- `docker-compose.yml` - Docker Compose configuration
|
||||||
|
- `.env.example` - Environment variable template
|
||||||
|
- `README.md` - This file
|
||||||
|
|
||||||
|
### 2. Create Configuration File
|
||||||
|
|
||||||
|
Copy the example environment file and edit it with your actual values:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cp .env.example .env
|
||||||
|
```
|
||||||
|
|
||||||
|
Edit `.env` and configure:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Your target service URL
|
||||||
|
TARGET_URL=http://your-kubernetes-service:8080
|
||||||
|
|
||||||
|
# Home Assistant configuration
|
||||||
|
HA_URL=http://homeassistant:8123
|
||||||
|
HA_TOKEN=your_long_lived_access_token_here
|
||||||
|
HA_ENTITY=switch.your_power_switch_entity
|
||||||
|
|
||||||
|
# Optional: Adjust timing if needed
|
||||||
|
GRACE_PERIOD=300 # 5 minutes
|
||||||
|
CHECK_INTERVAL=30 # Check every 30 seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Generate Home Assistant Token
|
||||||
|
|
||||||
|
1. Open Home Assistant web interface
|
||||||
|
2. Open your **user profile** (click your name at the bottom of the sidebar) → **Security** tab → **Long-lived access tokens**
|
||||||
|
3. Click **Create Token**
|
||||||
|
4. Name it (e.g., "Thinkcentre Watchdog")
|
||||||
|
5. Copy the token and paste it in your `.env` file as `HA_TOKEN`
|
||||||
|
|
||||||
|
### 4. Configure Power Switch in Home Assistant
|
||||||
|
|
||||||
|
Ensure you have a switch entity in Home Assistant that controls the machine's power. Common options:
|
||||||
|
|
||||||
|
- **Smart Outlet/Relay**: If using a smart power outlet
|
||||||
|
- **IPMI/Redfish**: For datacenter machines
|
||||||
|
- **Smart Plug**: Like Tasmota, Zigbee, or Z-Wave devices
|
||||||
|
|
||||||
|
Configure the entity ID in your `.env` as `HA_ENTITY` (e.g., `switch.thinkcentre_power`)
|
||||||
|
|
||||||
|
### 5. Build and Run
|
||||||
|
|
||||||
|
Start the monitoring container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
The container will:
|
||||||
|
- Build from the Dockerfile
|
||||||
|
- Start with `restart: unless-stopped` policy
|
||||||
|
- Bind-mount the host directory `./logs` to `/var/log` in the container
|
||||||
|
- Apply resource limits (0.1 CPU, 64MB memory)
|
||||||
|
|
||||||
|
### 6. View Logs
|
||||||
|
|
||||||
|
Monitor real-time logs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose logs -f thinkcenter-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
Or read the persistent log file from the bind-mounted `./logs` directory on the host:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tail -f ./logs/thinkcenter_monitor.log
|
||||||
|
# ./logs on the host is mounted at /var/log inside the container
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Stop or Restart
|
||||||
|
|
||||||
|
Stop the container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose down
|
||||||
|
```
|
||||||
|
|
||||||
|
Restart the container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose restart thinkcenter-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
## Deploying Multiple Instances
|
||||||
|
|
||||||
|
To monitor multiple machines:
|
||||||
|
|
||||||
|
### For Machine 2:
|
||||||
|
|
||||||
|
Create a separate directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir thinkcentre-watchdog-machine2
|
||||||
|
cd thinkcentre-watchdog-machine2
|
||||||
|
|
||||||
|
# Copy files
|
||||||
|
cp /path/to/original/* /path/to/original/.env.example .   # "*" does not match dotfiles, so copy .env.example explicitly
|
||||||
|
|
||||||
|
# Create unique .env
|
||||||
|
cp .env.example .env
|
||||||
|
|
||||||
|
# Edit .env for machine 2
|
||||||
|
nano .env
|
||||||
|
# Change: HA_ENTITY=switch.machine2_power
|
||||||
|
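# Change: TARGET_URL to machine 2's service URL
# Also change container_name in docker-compose.yml (e.g. thinkcenter-monitor-machine2); container names must be unique on a Docker host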
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
### Using Namespace (Alternative)
|
||||||
|
|
||||||
|
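Or manage everything from one directory by adding an override file (for example `docker-compose.machine2.yml`, which you need to create yourself) that defines a second service with its own service name, `container_name`, and environment values: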
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f docker-compose.yml -f docker-compose.machine2.yml up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `TARGET_URL` | `http://localhost:8080` | Service URL to monitor |
|
||||||
|
| `HA_URL` | `http://homeassistant:8123` | Home Assistant base URL |
|
||||||
|
| `HA_TOKEN` | (required) | Home Assistant long-lived access token |
|
||||||
|
| `HA_ENTITY` | `switch.thinkcentre_power` | Home Assistant switch entity ID |
|
||||||
|
| `GRACE_PERIOD` | `300` | Seconds to wait before power-cycling (5 minutes) |
|
||||||
|
| `CHECK_INTERVAL` | `30` | Seconds between health checks |
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Container won't start
|
||||||
|
|
||||||
|
Check if `HA_TOKEN` is set:
|
||||||
|
```bash
|
||||||
|
docker compose config | grep HA_TOKEN
|
||||||
|
```
|
||||||
|
|
||||||
|
### No logs appearing
|
||||||
|
|
||||||
|
Check that the bind-mounted log directory exists and is being written to:
|
||||||
|
```bash
|
||||||
|
ls -l ./logs
|
||||||
|
docker compose exec thinkcenter-monitor ls -l /var/log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Power-cycle not triggering
|
||||||
|
|
||||||
|
1. Verify HA_TOKEN is valid (check Home Assistant logs)
|
||||||
|
2. Confirm HA_ENTITY exists in Home Assistant
|
||||||
|
3. Check network connectivity: `docker compose exec thinkcenter-monitor curl -v http://homeassistant:8123`
|
||||||
|
|
||||||
|
### Service not responding correctly
|
||||||
|
|
||||||
|
Test the target URL directly:
|
||||||
|
```bash
|
||||||
|
docker compose exec thinkcenter-monitor curl -v http://your-service:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
1. **Health Check**: Every `CHECK_INTERVAL` seconds, HTTP response code is checked
|
||||||
|
2. **Grace Period**: First 502 error triggers a 5-minute window for recovery
|
||||||
|
3. **Recovery Detection**: If service returns non-502 during grace period, error resets
|
||||||
|
4. **Power Cycle**: After grace period expires with continued 502s, power cycle triggers:
|
||||||
|
- Send turn_off to HA switch entity
|
||||||
|
- Wait 10 seconds
|
||||||
|
- Send turn_on to HA switch entity
|
||||||
|
5. **Logging**: All events timestamped and logged to `/var/log/thinkcenter_monitor.log`
|
||||||
|
|
||||||
|
## Resource Limits
|
||||||
|
|
||||||
|
- CPU: 0.1 cores (limited to prevent resource hogging)
|
||||||
|
- Memory: 64MB (minimal requirements for bash + curl)
|
||||||
|
- Logging: JSON file driver, max 10MB per file, keeps 3 files (30MB total)
|
||||||
|
|
||||||
|
## Debugging
|
||||||
|
|
||||||
|
View the most recent log output (the script logs every health check, so no extra verbosity flag is needed):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose logs --tail 50 thinkcenter-monitor
|
||||||
|
```
|
||||||
|
|
||||||
|
To test the script locally (without Docker):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash thinkcenter_monitor.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Monitoring solution for Thinkcentre machines.
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
For issues or improvements, check the logs first and verify all environment variables are correctly set in your `.env` file.
|
||||||
|
|||||||
35
docker-compose.yml
Normal file
35
docker-compose.yml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
version: '3.8'

services:
  thinkcenter-monitor:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: thinkcenter-monitor
    restart: unless-stopped

    # Values are interpolated from the shell environment or the .env file.
    # Optional settings use "${VAR:-}" so an unset variable becomes an empty
    # string without a Compose warning; thinkcenter_monitor.sh then applies its
    # own default. HA_TOKEN uses "${VAR:?message}" so Compose refuses to start
    # when it is missing, instead of launching a container that exits and is
    # restarted in a loop by the restart policy.
    environment:
      TARGET_URL: "${TARGET_URL:-}"
      HA_URL: "${HA_URL:-}"
      HA_TOKEN: "${HA_TOKEN:?HA_TOKEN must be set (see .env.example)}"
      HA_ENTITY: "${HA_ENTITY:-}"
      GRACE_PERIOD: "${GRACE_PERIOD:-}"
      CHECK_INTERVAL: "${CHECK_INTERVAL:-}"
      LOG_FILE: /var/log/thinkcenter_monitor.log

    # Bind mount: the script's log file appears in ./logs on the host
    volumes:
      - ./logs:/var/log

    # Resource limits
    deploy:
      resources:
        limits:
          cpus: '0.1'
          memory: 64M

    # JSON file logging (container stdout/stderr): 3 files of at most 10 MB each
    logging:
      driver: json-file
      options:
        max-file: '3'
        max-size: '10m'
|
||||||
146
thinkcenter_monitor.sh
Normal file
146
thinkcenter_monitor.sh
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
|
||||||
|
# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
|
||||||
|
|
||||||
|
# Configuration from environment variables with sensible defaults.
# An empty value is treated like an unset one (":-"), so blank entries passed
# through by docker-compose fall back to the defaults below.
TARGET_URL="${TARGET_URL:-http://localhost:8080}"
HA_URL="${HA_URL:-http://homeassistant:8123}"
HA_TOKEN="${HA_TOKEN:-}"                 # required, validated below
HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
GRACE_PERIOD="${GRACE_PERIOD:-300}"      # 5 minutes in seconds
CHECK_INTERVAL="${CHECK_INTERVAL:-30}"   # 30 seconds between checks

# State variables
ERROR_START_TIME=""     # epoch seconds of the first 502 in the current streak; empty when none
IN_GRACE_PERIOD=false   # true while a 502 streak is being timed
LAST_RESPONSE_CODE=""   # response code seen on the previous iteration

# Initialize the log file BEFORE validation so that validation errors can be
# appended to it (previously the directory might not exist yet at that point).
# A failure here is reported but not fatal: messages still reach stdout/stderr.
if ! mkdir -p -- "$(dirname -- "$LOG_FILE")" || ! touch -- "$LOG_FILE"; then
  echo "WARNING: cannot create log file $LOG_FILE" >&2
fi

# Validate required configuration
if [[ -z "$HA_TOKEN" ]]; then
  echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE" >&2
  exit 1
fi

# GRACE_PERIOD is used in integer arithmetic and comparisons, so reject
# anything that is not a whole number of seconds up front.
if [[ ! "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
  echo "ERROR: GRACE_PERIOD must be a whole number of seconds (got '$GRACE_PERIOD'). Exiting." | tee -a "$LOG_FILE" >&2
  exit 1
fi
|
||||||
|
|
||||||
|
#######################################
# Write a timestamped message to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and appended to LOG_FILE
#######################################
log() {
  local message="${1-}"
  # Declare and assign separately so the assignment does not mask date's status.
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # printf prints the message verbatim (no option or backslash interpretation).
  printf '[%s] %s\n' "$timestamp" "$message" | tee -a "$LOG_FILE"
}
|
||||||
|
|
||||||
|
#######################################
# Probe TARGET_URL and print the HTTP status code.
# Globals:   TARGET_URL (read)
# Arguments: none
# Outputs:   the status code on stdout; "000" when no HTTP response was
#            received (curl's own value for connection failure/timeout, and
#            the fallback used here if curl produced no output at all)
# Returns:   0 always; callers decide based on the printed code
#######################################
check_target() {
  local response_code
  # -s: quiet, -o /dev/null: discard the body, -w: print only the status code.
  # Redirects are not followed (no -L). The whole request is capped at 5
  # seconds and connection setup at 3 seconds.
  response_code=$(curl -s -o /dev/null -w "%{http_code}" \
    --connect-timeout 3 -m 5 "$TARGET_URL" 2>/dev/null)
  printf '%s\n' "${response_code:-000}"
}
|
||||||
|
|
||||||
|
#######################################
# Call a Home Assistant switch service for one entity.
# Globals:   HA_URL (read), HA_TOKEN (read)
# Arguments: $1 - service name (turn_off or turn_on)
#            $2 - entity ID of the switch
# Returns:   curl's exit status; non-zero on transport errors AND, because of
#            --fail, on HTTP error responses (e.g. 401 for a bad token)
#######################################
_ha_switch_service() {
  local action="$1"
  local entity="$2"
  # --max-time keeps an unresponsive Home Assistant from blocking the monitor.
  # "${HA_URL%/}" tolerates a trailing slash in the configured base URL.
  curl --silent --fail --max-time 10 --request POST \
    -H "Authorization: Bearer $HA_TOKEN" \
    -H "Content-Type: application/json" \
    -d "{\"entity_id\": \"$entity\"}" \
    "${HA_URL%/}/api/services/switch/$action" >/dev/null 2>&1
}

#######################################
# Power-cycle a machine through its Home Assistant switch entity:
# turn_off, wait 10 seconds, turn_on. turn_on is attempted even when turn_off
# failed, so the switch is not left off on purpose.
# Globals:   ERROR_START_TIME, IN_GRACE_PERIOD (reset on exit)
# Arguments: $1 - entity ID of the switch
# Outputs:   progress and errors via log
# Returns:   0 if both requests succeeded, 1 if either failed
#######################################
trigger_power_cycle() {
  local entity="$1"
  local status=0

  log "ALERT: Triggering power cycle for entity: $entity"

  # Turn off
  log "Sending turn_off request to Home Assistant..."
  if _ha_switch_service turn_off "$entity"; then
    log "Turn off request sent successfully"
  else
    log "ERROR: Failed to send turn_off request"
    status=1
  fi

  # Wait 10 seconds
  log "Waiting 10 seconds before power-on..."
  sleep 10

  # Turn on
  log "Sending turn_on request to Home Assistant..."
  if _ha_switch_service turn_on "$entity"; then
    log "Turn on request sent successfully"
    log "Power cycle completed for $entity"
  else
    log "ERROR: Failed to send turn_on request"
    status=1
  fi

  # Reset state so the next 502 starts a fresh grace period
  ERROR_START_TIME=""
  IN_GRACE_PERIOD=false

  return "$status"
}
|
||||||
|
|
||||||
|
# Startup banner: record the effective configuration (HA_TOKEN is not logged).
log "=== Thinkcentre Monitor Started ==="
log "Configuration:"
log " TARGET_URL: $TARGET_URL"
log " HA_URL: $HA_URL"
log " HA_ENTITY: $HA_ENTITY"
log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"

# Main monitoring loop.
# State machine driven by the HTTP status of TARGET_URL:
#   502 with no streak in progress  -> remember the start time, begin grace period
#   502 and grace period elapsed    -> power-cycle via Home Assistant, reset state
#   502 and grace period not over   -> log the remaining time
#   anything else during the streak -> treat as recovered, reset state
while true; do
  RESPONSE_CODE=$(check_target)

  # Log all responses
  log "HTTP Response: $RESPONSE_CODE"

  if [[ "$RESPONSE_CODE" == "502" ]]; then
    # 502 Bad Gateway error detected

    if [[ -z "$ERROR_START_TIME" ]]; then
      # First 502 error - start grace period. The message reports the
      # configured duration rather than a hard-coded "5-minute".
      ERROR_START_TIME=$(date +%s)
      IN_GRACE_PERIOD=true
      log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
    else
      # Already in grace period - check if it has expired
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))

      if (( ELAPSED >= GRACE_PERIOD )); then
        # Grace period expired - trigger power cycle
        log "Grace period expired after ${ELAPSED}s. Service still unavailable."
        trigger_power_cycle "$HA_ENTITY"
        # Reset here as well so this loop does not depend on the callee
        # having cleared the state.
        ERROR_START_TIME=""
        IN_GRACE_PERIOD=false
      else
        # Still within grace period
        REMAINING=$((GRACE_PERIOD - ELAPSED))
        log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
      fi
    fi
  else
    # Any code other than 502.
    # NOTE(review): this includes "000" (no HTTP response at all), which
    # therefore also ends a grace period as "recovered" - confirm that this
    # is intended for machines that stop answering entirely.
    if [[ "$IN_GRACE_PERIOD" == true ]]; then
      # Service recovered during grace period
      CURRENT_TIME=$(date +%s)
      RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
      log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
      ERROR_START_TIME=""
      IN_GRACE_PERIOD=false
    elif [[ "$RESPONSE_CODE" != "000" ]]; then
      # Got a real HTTP response (not a timeout): report status changes only
      if [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
        log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
      fi
    fi
  fi

  LAST_RESPONSE_CODE="$RESPONSE_CODE"

  # Wait for next check
  sleep "$CHECK_INTERVAL"
done
|
||||||
Reference in New Issue
Block a user