diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..13da308 --- /dev/null +++ b/.env.example @@ -0,0 +1,32 @@ +# Thinkcentre Monitor Configuration +# Copy this file to .env and fill in your actual values + +# Target URL to monitor for health checks +# This is the service URL that will be checked every CHECK_INTERVAL seconds +TARGET_URL=http://kubernetes-service:8080 + +# Home Assistant configuration +# HA_URL: Base URL of your Home Assistant instance +# Example: http://homeassistant:8123 or http://192.168.1.100:8123 +HA_URL=http://homeassistant:8123 + +# HA_TOKEN: Long-lived access token from Home Assistant +# To generate: in Home Assistant open your user Profile (click your name in the sidebar) → Security → Long-Lived Access Tokens +# REQUIRED - the script will exit if this is not set +HA_TOKEN=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiI5MGUzZjBmZTYwMjc0NWU0YTAwOTdiOTk2MjY1MzU2YyIsImlhdCI6MTczMzQwMjEwMiwiZXhwIjoyMDQ4NzYyMTAyfQ.example_token_replace_with_yours + +# HA_ENTITY: Home Assistant entity ID for the power switch +# This is the switch entity that will be toggled to power-cycle the machine +# Example: switch.thinkcentre_power or switch.machine_reboot_relay +HA_ENTITY=switch.thinkcentre_power + +# Grace period in seconds +# When a 502 error is detected, the system waits this long for recovery +# (useful for deployment scenarios where services may temporarily be unavailable) +# Default: 300 (5 minutes) +GRACE_PERIOD=300 + +# Health check interval in seconds +# How frequently to check the target URL +# Default: 30 seconds +CHECK_INTERVAL=30 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c6791ed --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM alpine:latest + +# Install required dependencies +RUN apk add --no-cache \ + bash \ + curl \ + ca-certificates + +# Create application directory +WORKDIR /app + +# Copy monitoring script +COPY thinkcenter_monitor.sh /app/thinkcenter_monitor.sh + +# Create log directory +RUN mkdir -p /var/log + +# Make 
script executable +RUN chmod +x /app/thinkcenter_monitor.sh + +# Set the entrypoint +CMD ["/app/thinkcenter_monitor.sh"] diff --git a/README.md b/README.md index 469889d..f0dbebe 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,242 @@ -# Thinkcentre-watchdog +# Thinkcentre Watchdog -A watchdog daemon for my thinkcentre machines which tend to hang. \ No newline at end of file +A Docker-based monitoring solution for detecting and auto-rebooting hung Kubernetes machines via Home Assistant integration. + +## Overview + +This watchdog monitors a target service URL for 502 Bad Gateway errors (indicating a hung machine). When a service fails: + +1. A 5-minute grace period begins (allowing for deployment recoveries) +2. If the service recovers within 5 minutes, the error is cleared (normal deployment scenario) +3. If still failing after 5 minutes, an automatic power-cycle is triggered via Home Assistant +4. The machine powers off for 10 seconds, then powers back on + +All activity is logged with timestamps for monitoring and troubleshooting. + +## Prerequisites + +- Docker and Docker Compose installed +- Home Assistant instance running with network access +- A power switch entity configured in Home Assistant +- Long-lived access token from Home Assistant + +## Installation + +### 1. Download/Organize Files + +Clone or download this repository to your machine: + +```bash +git clone +cd Thinkcentre-watchdog +``` + +The directory should contain: +- `Dockerfile` - Container definition +- `thinkcenter_monitor.sh` - Monitoring script +- `docker-compose.yml` - Docker Compose configuration +- `.env.example` - Environment variable template +- `README.md` - This file + +### 2. 
Create Configuration File + +Copy the example environment file and edit it with your actual values: + +```bash +cp .env.example .env +``` + +Edit `.env` and configure: + +```bash +# Your target service URL +TARGET_URL=http://your-kubernetes-service:8080 + +# Home Assistant configuration +HA_URL=http://homeassistant:8123 +HA_TOKEN=your_long_lived_access_token_here +HA_ENTITY=switch.your_power_switch_entity + +# Optional: Adjust timing if needed +GRACE_PERIOD=300 # 5 minutes +CHECK_INTERVAL=30 # Check every 30 seconds +``` + +### 3. Generate Home Assistant Token + +1. Open Home Assistant web interface +2. Open your user **Profile** (click your name at the bottom of the sidebar), select the **Security** tab, and scroll to **Long-Lived Access Tokens** +3. Click **Create Token** +4. Name it (e.g., "Thinkcentre Watchdog") +5. Copy the token and paste it in your `.env` file as `HA_TOKEN` + +### 4. Configure Power Switch in Home Assistant + +Ensure you have a switch entity in Home Assistant that controls the machine's power. Common options: + +- **Smart Outlet/Relay**: If using a smart power outlet +- **IPMI/Redfish**: For datacenter machines +- **Smart Plug**: Like Tasmota, Zigbee, or Z-Wave devices + +Configure the entity ID in your `.env` as `HA_ENTITY` (e.g., `switch.thinkcentre_power`) + +### 5. Build and Run + +Start the monitoring container: + +```bash +docker compose up -d +``` + +The container will: +- Build from the Dockerfile +- Start with `restart: unless-stopped` policy +- Bind-mount the local `./logs` directory to `/var/log` for persistent logs +- Apply resource limits (0.1 CPU, 64MB memory) + +### 6. View Logs + +Monitor real-time logs: + +```bash +docker compose logs -f thinkcenter-monitor +``` + +Or read the persistent log file from the bind-mounted `./logs` directory: + +```bash +tail -f ./logs/thinkcenter_monitor.log +# ./logs on the host is mounted at /var/log inside the container +``` + +### 7. 
Stop or Restart + +Stop the container: + +```bash +docker compose down +``` + +Restart the container: + +```bash +docker compose restart thinkcenter-monitor +``` + +## Deploying Multiple Instances + +To monitor multiple machines: + +### For Machine 2: + +Create a separate directory: + +```bash +mkdir thinkcentre-watchdog-machine2 +cd thinkcentre-watchdog-machine2 + +# Copy files (the * glob skips dotfiles, so .env.example is named explicitly) +cp /path/to/original/* /path/to/original/.env.example . + +# Create unique .env +cp .env.example .env + +# Edit .env for machine 2 +nano .env +# Change: HA_ENTITY=switch.machine2_power +# Change: TARGET_URL to machine 2's service URL +``` + +Then change `container_name` in the copied `docker-compose.yml` to a unique value (the default `thinkcenter-monitor` is already used by the first instance, and Docker container names must be unique) and run: + +```bash +docker compose up -d +``` + +### Using Namespace (Alternative) + +Or manage from one directory by adding an override file that you create yourself (for example `docker-compose.machine2.yml`) defining a second service with its own service name, `container_name`, and environment: + +```bash +docker compose -f docker-compose.yml -f docker-compose.machine2.yml up -d +``` + +## Configuration Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `TARGET_URL` | `http://localhost:8080` | Service URL to monitor | +| `HA_URL` | `http://homeassistant:8123` | Home Assistant base URL | +| `HA_TOKEN` | (required) | Home Assistant long-lived access token | +| `HA_ENTITY` | `switch.thinkcentre_power` | Home Assistant switch entity ID | +| `GRACE_PERIOD` | `300` | Seconds to wait before power-cycling (5 minutes) | +| `CHECK_INTERVAL` | `30` | Seconds between health checks | + +## Troubleshooting + +### Container won't start + +Check if `HA_TOKEN` is set: +```bash +docker compose config | grep HA_TOKEN +``` + +### No logs appearing + +Check the bind-mounted log directory on the host and inside the container: +```bash +ls -l ./logs +docker compose exec thinkcenter-monitor ls -l /var/log +``` + +### Power-cycle not triggering + +1. Verify HA_TOKEN is valid (check Home Assistant logs) +2. Confirm HA_ENTITY exists in Home Assistant +3. 
Check network connectivity: `docker compose exec thinkcenter-monitor curl -v http://homeassistant:8123` + +### Service not responding correctly + +Test the target URL directly: +```bash +docker compose exec thinkcenter-monitor curl -v http://your-service:8080 +``` + +## How It Works + +1. **Health Check**: Every `CHECK_INTERVAL` seconds, HTTP response code is checked +2. **Grace Period**: First 502 error triggers a 5-minute window for recovery +3. **Recovery Detection**: If service returns non-502 during grace period, error resets +4. **Power Cycle**: After grace period expires with continued 502s, power cycle triggers: + - Send turn_off to HA switch entity + - Wait 10 seconds + - Send turn_on to HA switch entity +5. **Logging**: All events timestamped and logged to `/var/log/thinkcenter_monitor.log` + +## Resource Limits + +- CPU: 0.1 cores (limited to prevent resource hogging) +- Memory: 64MB (minimal requirements for bash + curl) +- Logging: JSON file driver, max 10MB per file, keeps 3 files (30MB total) + +## Debugging + +Enable verbose output by checking logs with: + +```bash +docker compose logs --tail 50 thinkcenter-monitor +``` + +To test the script locally (without Docker): + +```bash +bash thinkcenter_monitor.sh +``` + +## License + +Monitoring solution for Thinkcentre machines. + +## Support + +For issues or improvements, check the logs first and verify all environment variables are correctly set in your `.env` file. diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..241f4ff --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,35 @@ +version: '3.8' + +services: + thinkcenter-monitor: + build: + context: . 
+ dockerfile: Dockerfile + container_name: thinkcenter-monitor + restart: unless-stopped + + environment: + TARGET_URL: ${TARGET_URL} + HA_URL: ${HA_URL} + HA_TOKEN: ${HA_TOKEN} + HA_ENTITY: ${HA_ENTITY} + GRACE_PERIOD: ${GRACE_PERIOD} + CHECK_INTERVAL: ${CHECK_INTERVAL} + LOG_FILE: /var/log/thinkcenter_monitor.log + + volumes: + - ./logs:/var/log + + # Resource limits + deploy: + resources: + limits: + cpus: '0.1' + memory: 64M + + # JSON file logging + logging: + driver: json-file + options: + max-file: '3' + max-size: '10m' diff --git a/pandora_monitor.sh b/pandora_monitor.sh deleted file mode 100644 index bd363f2..0000000 --- a/pandora_monitor.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -# Monitoring script for pandora machine -# Checks if vorgabenportal.knowyoursecurity.com returns 502 Bad Gateway (indicates pandora hung) -# If 502 detected for >5 minutes, power-cycles the machine via Home Assistant - -# Configuration -TARGET_URL="https://vorgabenportal.knowyoursecurity.com" -HA_URL="http://homeassistant.local:8123" # Home Assistant URL -HA_TOKEN="YOUR_LONG_LIVED_ACCESS_TOKEN" # Get from HA: Profile > Long-Lived Access Tokens -HA_ENTITY="switch.pandora_power" # Entity ID of the smart switch/outlet -LOG_FILE="/var/log/pandora_monitor.log" -MAX_FAILURES=2 # Reboot after 2 consecutive failures -FAILURE_COUNT=0 -FAILURE_START_TIME=0 -GRACE_PERIOD=300 # 5 minutes grace for deployments -CHECK_INTERVAL=60 # Check every 1 minute (more granular during grace period) - -# Function to log messages -log_message() { - echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE" -} - -# Function to check if machine is responsive -# If pandora hangs, the web service behind it returns 502 Bad Gateway -# Returns the HTTP code for caller to evaluate -check_responsive() { - local http_code=$(curl -s -o /dev/null -w "%{http_code}" \ - --max-time 10 \ - --connect-timeout 5 \ - "$TARGET_URL" 2>/dev/null) - - echo "$http_code" -} - -# Function to power cycle via Home Assistant 
-power_cycle() { - log_message "⚠️ ALERT: Service unresponsive for $FAILURE_COUNT consecutive checks! Triggering power cycle..." - - # Turn off the switch - curl -s -X POST \ - -H "Authorization: Bearer $HA_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"entity_id":"'"$HA_ENTITY"'"}' \ - "$HA_URL/api/services/switch/turn_off" &>/dev/null - - log_message "Power OFF command sent to $HA_ENTITY" - sleep 10 # Wait 10 seconds for machine to shut down - - # Turn on the switch - curl -s -X POST \ - -H "Authorization: Bearer $HA_TOKEN" \ - -H "Content-Type: application/json" \ - -d '{"entity_id":"'"$HA_ENTITY"'"}' \ - "$HA_URL/api/services/switch/turn_on" &>/dev/null - - log_message "Power ON command sent to $HA_ENTITY - Pandora rebooting" -} - -# Main monitoring loop -log_message "========================================" -log_message "Starting pandora monitoring service" -log_message "Monitoring: $TARGET_URL" -log_message "HA Entity: $HA_ENTITY" -log_message "Check interval: ${CHECK_INTERVAL}s" -log_message "Grace period: ${GRACE_PERIOD}s" -log_message "========================================" - -while true; do - http_code=$(check_responsive) - current_time=$(date +%s) - - # Check if service is OK (not 502 and not timeout) - if [ "$http_code" != "502" ] && [ "$http_code" != "000" ]; then - # Service is healthy - if [ $FAILURE_COUNT -gt 0 ]; then - log_message "✓ Service recovered! 
(was down $FAILURE_COUNT times)"
-        fi
-        FAILURE_COUNT=0
-        FAILURE_START_TIME=0
-        log_message "✓ HTTP $http_code from $TARGET_URL (healthy)"
-    else
-        # Service returned 502 or timeout
-        if [ $FAILURE_COUNT -eq 0 ]; then
-            # First failure - start grace period
-            FAILURE_START_TIME=$current_time
-            FAILURE_COUNT=1
-            log_message "⚠️ First 502/timeout detected (HTTP $http_code) - Starting 5 minute grace period"
-        else
-            # Already in failure state
-            FAILURE_COUNT=$((FAILURE_COUNT + 1))
-            elapsed=$((current_time - FAILURE_START_TIME))
-            remaining=$((GRACE_PERIOD - elapsed))
-
-            if [ $remaining -gt 0 ]; then
-                log_message "⚠️ Still seeing 502 (HTTP $http_code) - Grace period: ${remaining}s remaining"
-            else
-                # Grace period expired - trigger reboot
-                log_message "🔴 CRITICAL: Service unresponsive for >5 minutes! Triggering power cycle..."
-                power_cycle
-                FAILURE_COUNT=0
-                FAILURE_START_TIME=0
-                sleep 120 # Wait 2 minutes after reboot attempt before checking again
-            fi
-        fi
-    fi
-
-    sleep "$CHECK_INTERVAL"
-done
diff --git a/thinkcenter_monitor.sh b/thinkcenter_monitor.sh
new file mode 100644
index 0000000..ecb07a4
--- /dev/null
+++ b/thinkcenter_monitor.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+
+# Thinkcentre Monitor - Kubernetes Machine Health Monitor with Home Assistant Integration
+# Detects hung machines (502 errors) and auto-reboots them via Home Assistant
+#
+# Environment variables (defaults are assigned below):
+#   TARGET_URL      URL whose HTTP status code is checked on every iteration
+#   HA_URL          Home Assistant base URL
+#   HA_TOKEN        Home Assistant long-lived access token (required)
+#   HA_ENTITY       switch entity that is turned off and on again
+#   LOG_FILE        file that log() appends to
+#   GRACE_PERIOD    seconds of 502 responses tolerated before power-cycling
+#   CHECK_INTERVAL  seconds slept between checks
+
+# Configuration from environment variables with sensible defaults
+TARGET_URL="${TARGET_URL:-http://localhost:8080}"
+HA_URL="${HA_URL:-http://homeassistant:8123}"
+HA_TOKEN="${HA_TOKEN:-}"
+HA_ENTITY="${HA_ENTITY:-switch.thinkcentre_power}"
+LOG_FILE="${LOG_FILE:-/var/log/thinkcenter_monitor.log}"
+GRACE_PERIOD="${GRACE_PERIOD:-300}" # 5 minutes in seconds
+CHECK_INTERVAL="${CHECK_INTERVAL:-30}" # 30 seconds between checks
+
+# State variables
+ERROR_START_TIME=""
+IN_GRACE_PERIOD=false
+LAST_RESPONSE_CODE=""
+
+# Initialize log file first so the startup errors below can be appended to it
+mkdir -p "$(dirname "$LOG_FILE")"
+touch "$LOG_FILE"
+
+# Validate required configuration
+if [[ -z "$HA_TOKEN" ]]; then
+    echo "ERROR: HA_TOKEN environment variable is not set. Exiting." | tee -a "$LOG_FILE"
+    exit 1
+fi
+
+# GRACE_PERIOD is used in shell arithmetic, so it must be a plain integer
+if [[ ! "$GRACE_PERIOD" =~ ^[0-9]+$ ]]; then
+    echo "ERROR: GRACE_PERIOD must be a whole number of seconds (got '$GRACE_PERIOD'). Exiting." | tee -a "$LOG_FILE"
+    exit 1
+fi
+
+# Write a timestamped message to stdout and append it to LOG_FILE.
+# Arguments: $1 - message text
+log() {
+    local message="$1"
+    local timestamp
+    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+    echo "[$timestamp] $message" | tee -a "$LOG_FILE"
+}
+
+# Probe TARGET_URL once and print the HTTP status code on stdout.
+# Prints 000 when no HTTP response was received within the 5 second limit.
+check_target() {
+    local response_code
+    # curl's exit status is deliberately ignored: -w already reports 000 on failure
+    response_code=$(curl -s -o /dev/null -w "%{http_code}" -m 5 "$TARGET_URL" 2>/dev/null)
+    echo "${response_code:-000}"
+}
+
+# Call a Home Assistant switch service for one entity.
+# Arguments: $1 - service name (turn_on or turn_off), $2 - entity ID
+# Returns:   curl's exit status; --fail turns HTTP errors (>= 400) into a failure
+call_switch_service() {
+    local service="$1"
+    local entity="$2"
+
+    # --max-time keeps an unresponsive Home Assistant from blocking the monitor forever
+    curl -s --fail --max-time 15 -X POST \
+        -H "Authorization: Bearer $HA_TOKEN" \
+        -H "Content-Type: application/json" \
+        -d "{\"entity_id\": \"$entity\"}" \
+        "$HA_URL/api/services/switch/$service" > /dev/null 2>&1
+}
+
+# Power-cycle a machine: turn the switch off, wait 10 seconds, turn it on again.
+# Arguments: $1 - Home Assistant switch entity ID
+# Globals:   resets ERROR_START_TIME and IN_GRACE_PERIOD
+trigger_power_cycle() {
+    local entity="$1"
+
+    log "ALERT: Triggering power cycle for entity: $entity"
+
+    # Turn off
+    log "Sending turn_off request to Home Assistant..."
+    if call_switch_service turn_off "$entity"; then
+        log "Turn off request sent successfully"
+    else
+        log "ERROR: Failed to send turn_off request"
+    fi
+
+    # Wait 10 seconds
+    log "Waiting 10 seconds before power-on..."
+    sleep 10
+
+    # Turn on - attempted even if turn_off failed, so the switch is never left off
+    log "Sending turn_on request to Home Assistant..."
+    if call_switch_service turn_on "$entity"; then
+        log "Turn on request sent successfully"
+        log "Power cycle completed for $entity"
+    else
+        log "ERROR: Failed to send turn_on request"
+    fi
+
+    # Reset state
+    ERROR_START_TIME=""
+    IN_GRACE_PERIOD=false
+}
+
+log "=== Thinkcentre Monitor Started ==="
+log "Configuration:"
+log " TARGET_URL: $TARGET_URL"
+log " HA_URL: $HA_URL"
+log " HA_ENTITY: $HA_ENTITY"
+log " GRACE_PERIOD: ${GRACE_PERIOD}s ($(( GRACE_PERIOD / 60 )) minutes)"
+log " CHECK_INTERVAL: ${CHECK_INTERVAL}s"
+
+# Main monitoring loop
+while true; do
+    RESPONSE_CODE=$(check_target)
+
+    # Log all responses
+    log "HTTP Response: $RESPONSE_CODE"
+
+    if [[ "$RESPONSE_CODE" == "502" ]]; then
+        # 502 Bad Gateway error detected
+        if [[ -z "$ERROR_START_TIME" ]]; then
+            # First 502 error - start grace period
+            ERROR_START_TIME=$(date +%s)
+            IN_GRACE_PERIOD=true
+            log "502 error detected - starting ${GRACE_PERIOD}s grace period (recovery window for deployment scenarios)"
+        else
+            # Already in grace period - check if it has expired
+            CURRENT_TIME=$(date +%s)
+            ELAPSED=$((CURRENT_TIME - ERROR_START_TIME))
+
+            if [[ $ELAPSED -ge $GRACE_PERIOD ]]; then
+                # Grace period expired - trigger power cycle (which also resets the state)
+                log "Grace period expired after ${ELAPSED}s. Service still unavailable."
+                trigger_power_cycle "$HA_ENTITY"
+            else
+                # Still within grace period
+                REMAINING=$((GRACE_PERIOD - ELAPSED))
+                log "Still in grace period. Service recovery window: ${REMAINING}s remaining"
+            fi
+        fi
+    elif [[ "$RESPONSE_CODE" == "000" ]]; then
+        # No HTTP response (timeout / connection failure). This is neither a
+        # 502 nor proof of recovery, so the grace-period state is left untouched.
+        if [[ "$IN_GRACE_PERIOD" == true ]]; then
+            log "No HTTP response received - not counted as recovery, grace period continues"
+        fi
+    else
+        # Service answered with a real HTTP status other than 502
+        if [[ "$IN_GRACE_PERIOD" == true ]]; then
+            # Service recovered during grace period
+            CURRENT_TIME=$(date +%s)
+            RECOVERY_TIME=$((CURRENT_TIME - ERROR_START_TIME))
+            log "Service recovered during grace period after ${RECOVERY_TIME}s. Resetting error state."
+            ERROR_START_TIME=""
+            IN_GRACE_PERIOD=false
+        elif [[ -n "$LAST_RESPONSE_CODE" && "$LAST_RESPONSE_CODE" != "$RESPONSE_CODE" ]]; then
+            log "Service status changed from $LAST_RESPONSE_CODE to $RESPONSE_CODE"
+        fi
+    fi
+
+    LAST_RESPONSE_CODE="$RESPONSE_CODE"
+
+    # Wait for next check
+    sleep "$CHECK_INTERVAL"
+done