Heartbeat und Überwachung eingeführt
This commit is contained in:
142
monitor.py
Normal file
142
monitor.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Monitoring-Script für CameraSave
|
||||
Prüft, ob das Programm regelmäßig läuft und sendet E-Mail-Benachrichtigungen bei Problemen.
|
||||
"""
|
||||
|
||||
import os
|
||||
import smtplib
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
|
||||
# === CONFIGURATION ===
# Every value below is read from the environment once, at import time;
# the second argument is the default used when the variable is not set.
HEARTBEAT_FILE = os.environ.get("HEARTBEAT_FILE", "/app/heartbeat.txt")
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "600"))  # seconds (10 minutes)
MAX_AGE = int(os.environ.get("MAX_AGE", "900"))  # seconds (15 minutes) - alarm beyond this

# Minimum number of seconds between two notification mails (default: 1 hour).
ALERT_COOLDOWN = int(os.environ.get("ALERT_COOLDOWN", "3600"))

# E-mail settings used for notifications.
SMTP_SERVER = os.environ.get("SMTP_SERVER", "securesmtp.t-online.de")
SMTP_PORT = int(os.environ.get("SMTP_PORT", "587"))
# Credentials fall back to EMAIL_USER / EMAIL_PASS, then to an empty string.
SMTP_USER = os.environ.get("SMTP_USER", os.environ.get("EMAIL_USER", ""))
SMTP_PASS = os.environ.get("SMTP_PASS", os.environ.get("EMAIL_PASS", ""))
# Default recipient is the sending account itself.
ALERT_EMAIL = os.environ.get("ALERT_EMAIL", SMTP_USER)

# Time of the last successfully sent mail; None until one has been sent.
last_alert_time = None
|
||||
|
||||
|
||||
def send_alert_email(subject, message):
    """Send a plain-text notification mail through the configured SMTP server.

    The mail is skipped while fewer than ALERT_COOLDOWN seconds have passed
    since the last successfully sent mail. Delivery is best effort: any
    error is printed and swallowed so the caller is never interrupted.

    Args:
        subject: Subject line of the mail.
        message: Text inserted into the body template.

    Returns:
        None. The outcome is reported on stdout only.
    """
    global last_alert_time

    # Upper bound (seconds) for each blocking SMTP socket operation, so an
    # unresponsive mail server cannot stall the caller indefinitely.
    smtp_timeout = 30

    # Cooldown check - do not send mails too frequently.
    if last_alert_time is not None:
        time_since_last = (datetime.now() - last_alert_time).total_seconds()
        if time_since_last < ALERT_COOLDOWN:
            print(f"⏳ Cooldown aktiv. Nächste E-Mail möglich in {int(ALERT_COOLDOWN - time_since_last)}s")
            return

    try:
        msg = MIMEMultipart()
        msg['From'] = SMTP_USER
        msg['To'] = ALERT_EMAIL
        msg['Subject'] = subject

        # os.uname() is not available on every platform, hence the hasattr guard.
        body = f"""
CameraSave Monitoring Alert
==========================

{message}

Zeit: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Server: {os.uname().nodename if hasattr(os, 'uname') else 'Unknown'}

Diese Nachricht wurde automatisch vom Monitoring-System generiert.
"""
        msg.attach(MIMEText(body, 'plain'))

        # The context manager closes the connection even when STARTTLS,
        # login or sending raises; a trailing quit() call would be skipped
        # in those cases and leak the socket.
        with smtplib.SMTP(SMTP_SERVER, SMTP_PORT, timeout=smtp_timeout) as server:
            server.starttls()
            server.login(SMTP_USER, SMTP_PASS)
            server.send_message(msg)

        # Only a mail that was actually sent starts a new cooldown window.
        last_alert_time = datetime.now()
        print(f"✅ Alert-E-Mail gesendet an {ALERT_EMAIL}")

    except Exception as e:
        # Deliberate best effort: report the failure and carry on.
        print(f"❌ Fehler beim E-Mail-Versand: {e}")
|
||||
|
||||
|
||||
def check_heartbeat():
    """Check whether the heartbeat file has been modified recently enough.

    Returns:
        A ``(healthy, text)`` tuple. ``healthy`` is True only when the file
        exists and its modification time is at most MAX_AGE seconds old;
        ``text`` is a human-readable status line.
    """
    heartbeat_present = os.path.exists(HEARTBEAT_FILE)
    if not heartbeat_present:
        return False, "Heartbeat-Datei existiert nicht"

    try:
        last_modified = os.path.getmtime(HEARTBEAT_FILE)
        seconds_old = time.time() - last_modified
        is_stale = seconds_old > MAX_AGE

        if not is_stale:
            return True, f"OK - Heartbeat vor {int(seconds_old)}s"

        return False, f"Heartbeat zu alt: {int(seconds_old)}s (max {MAX_AGE}s)"

    except Exception as err:
        # E.g. the file vanished between the existence check and the stat call.
        return False, f"Fehler beim Lesen der Heartbeat-Datei: {err}"
|
||||
|
||||
|
||||
def send_recovery_email():
    """Send a mail announcing that the monitored program is healthy again.

    The recovery mail bypasses the alert cooldown. The outage alert that
    precedes a recovery starts the cooldown window in send_alert_email(),
    so without clearing it a recovery within ALERT_COOLDOWN seconds of the
    alert would be silently dropped.
    """
    global last_alert_time

    # Clear the cooldown so send_alert_email() does not return early.
    # It sets last_alert_time again once this mail has been sent.
    last_alert_time = None

    send_alert_email(
        "✅ CameraSave: System wiederhergestellt",
        "Das CameraSave-Programm läuft wieder normal."
    )
|
||||
|
||||
|
||||
def main():
    """Run the monitoring loop until it is interrupted with Ctrl+C.

    Each cycle calls check_heartbeat(), prints a timestamped status line and
    sends a mail on state changes: an alert on the first failed check of an
    outage, and a recovery mail on the first healthy check after one. The
    loop then sleeps CHECK_INTERVAL seconds.
    """
    print("🔍 CameraSave Monitor gestartet")
    print(f" Heartbeat-Datei: {HEARTBEAT_FILE}")
    print(f" Prüfintervall: {CHECK_INTERVAL}s")
    print(f" Max Alter: {MAX_AGE}s")
    print(f" Alert E-Mail: {ALERT_EMAIL}")
    print()

    # True while an outage is known, so each state change is mailed once.
    was_down = False

    # The KeyboardInterrupt handler wraps the whole loop, including the
    # sleep, so Ctrl+C always ends the monitor with the stop message rather
    # than a traceback (also when it arrives right after a failed cycle).
    try:
        while True:
            try:
                is_healthy, status = check_heartbeat()
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                if is_healthy:
                    print(f"[{timestamp}] ✅ {status}")
                    if was_down:
                        # System is back online.
                        send_recovery_email()
                        was_down = False
                else:
                    print(f"[{timestamp}] ❌ {status}")
                    if not was_down:
                        # First failed check of this outage - send the alarm.
                        # NOTE(review): send_alert_email() gives no feedback,
                        # so was_down is set even if the mail was suppressed
                        # by the cooldown or failed to send.
                        send_alert_email(
                            "⚠️ CameraSave: Programm läuft nicht mehr!",
                            f"Das CameraSave-Programm antwortet nicht mehr.\n\nStatus: {status}"
                        )
                        was_down = True

            except Exception as e:
                # Keep the monitor alive; the next cycle retries the check.
                print(f"❌ Unerwarteter Fehler: {e}")

            # Single sleep shared by the success and the error path.
            time.sleep(CHECK_INTERVAL)

    except KeyboardInterrupt:
        print("\n🛑 Monitor gestoppt")
|
||||
|
||||
|
||||
# Start the monitor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user