# lainmonitor/lainmonitor.py

#!/usr/bin/env python3

# --------------------------------------------------------------------------
# Description: A Telegram bot for monitoring critical infrastructure services
# Dependencies: telebot, paramiko, requests
# Usage: python3 lainmonitor.py | or run it as a service
# Author: h@x
# Version: 2.1.0
# --------------------------------------------------------------------------

import functools
import logging
import os
import socket
import ssl
import subprocess
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlsplit

import paramiko
import requests
import telebot
from telebot import types

import config

# Configure logging once at import time: INFO level, with each record showing
# its timestamp, level and originating logger name.
tlogging_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=tlogging_format)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)

# Certificates fetched from monitored hosts are kept in a 'certs' directory
# next to this file. exist_ok=True makes the call a no-op when the directory
# is already there, so no separate existence check is needed.
CERT_DIR = os.path.join(os.path.dirname(__file__), 'certs')
os.makedirs(CERT_DIR, exist_ok=True)

# Telegram bot client, created with the token from the local config module.
bot = telebot.TeleBot(config.TOKEN)
# Chat IDs permitted to use the bot; held as a set because it is only ever
# consulted through membership tests.
ALLOWED_CHATS = set(config.ALLOWED_CHATS)

# Utility for command execution with timeout
def run_cmd(cmd, timeout=5):
    """Run *cmd* and return its stripped standard output.

    Args:
        cmd: Argument list passed straight to ``subprocess.run`` (no shell).
        timeout: Seconds to wait before abandoning the command.

    Returns:
        The captured stdout with surrounding whitespace removed. The sentinel
        string ``'timeout'`` is returned when the command exceeds *timeout*,
        and ``'error'`` when starting it raises ``OSError``. The exit status
        is not inspected.
    """
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as e:
        logger.warning(f"Command {cmd} timed out: {e}")
        return 'timeout'
    except OSError as e:
        logger.error(f"OS error running {cmd}: {e}")
        return 'error'
    else:
        return completed.stdout.strip()

# Local system info
def get_local_info():
    """Collect basic health facts about the machine this bot runs on.

    Each value is the output of a local command executed through ``run_cmd``,
    so any of them may also be one of its sentinel strings (``'error'`` or
    ``'timeout'``).

    Returns:
        A dict with the keys ``hostname``, ``uptime``, ``load_avg``,
        ``memory``, ``disk`` and ``status``. ``status`` is ``'online'`` when
        the hostname lookup produced a real value and ``'offline'`` otherwise.
        ``load_avg`` is ``'unknown'`` when the ``uptime`` output carries no
        ``load average:`` section.
    """
    failed_values = ('', 'error', 'timeout')
    marker = 'load average:'

    hostname = run_cmd(['hostname'])
    uptime = run_cmd(['uptime', '-p'])

    # The plain `uptime` output ends with "load average: a, b, c".
    raw_uptime = run_cmd(['uptime'])
    if marker in raw_uptime:
        load_avg = raw_uptime.split(marker)[-1].strip()
    else:
        load_avg = 'unknown'

    memory = run_cmd(['free', '-h'])
    disk = run_cmd(['df', '-h'])

    return {
        'hostname': hostname,
        'uptime': uptime,
        'load_avg': load_avg,
        'memory': memory,
        'disk': disk,
        'status': 'offline' if hostname in failed_values else 'online',
    }

# Fetch and store SSL certificate once
def fetch_certificate(host, port):
    """Return a path to the cached PEM certificate of ``host``, fetching it if needed.

    The certificate is downloaded on the first call only and stored as
    ``<CERT_DIR>/<host>.pem``; later calls reuse that file without contacting
    the server again. The download goes through ``ssl.get_server_certificate``
    with no CA bundle supplied.

    The file is written to a temporary name and moved into place with
    ``os.replace``, so an interrupted or failed write can never leave a
    truncated certificate that later calls would mistake for a valid cache.

    Args:
        host: Server host name; also used as the cache file name.
        port: TCP port to fetch the certificate from.

    Returns:
        The path of the cached certificate file, or ``True`` when the
        certificate could not be fetched or stored. The result is meant to be
        passed as the ``verify=`` argument of ``requests``, where ``True``
        selects the default CA verification.
    """
    cert_path = os.path.join(CERT_DIR, f"{host}.pem")
    if os.path.isfile(cert_path):
        return cert_path

    tmp_path = None
    try:
        cert = ssl.get_server_certificate((host, port))
        # Write next to the final location so os.replace stays on one filesystem.
        fd, tmp_path = tempfile.mkstemp(dir=CERT_DIR, suffix='.tmp')
        with os.fdopen(fd, 'w') as f:
            f.write(cert)
        os.replace(tmp_path, cert_path)
        tmp_path = None  # moved into place; nothing left to clean up
        logger.info(f"Saved certificate for {host} to {cert_path}")
        return cert_path
    except Exception as e:
        # Best effort: any failure falls back to default verification.
        logger.error(f"Failed to fetch certificate for {host}: {e}")
        return True
    finally:
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError as e:
                logger.warning(f"Could not remove temporary file {tmp_path}: {e}")

# SSH-based info gathering
def get_ssh_info(ip, cfg):
    """Collect health facts from a remote host over SSH.

    Args:
        ip: Address of the host to connect to.
        cfg: Host configuration; must provide ``ssh_user`` and ``ssh_pass``.

    Returns:
        A tuple ``(ip, info)``. On success ``info`` holds ``hostname``,
        ``uptime``, ``load_avg``, ``memory``, ``disk`` and
        ``status == 'online'``; a field whose command failed is ``'error'``.
        If the connection cannot be established or breaks, ``info`` is
        ``{'status': 'unreachable'}``.
    """
    client = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts any unknown host key, so the first
    # connection to a host is not authenticated. Consider loading known hosts.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    cmds = {
        'hostname': 'hostname',
        'uptime': 'uptime -p',
        'load_avg': 'uptime',
        'memory': 'free -h',
        'disk': 'df -h',
    }
    try:
        client.connect(ip, username=cfg['ssh_user'], password=cfg['ssh_pass'], timeout=5)
        info = {}
        for key, cmd in cmds.items():
            try:
                _, stdout, _ = client.exec_command(cmd, timeout=5)
                # Undecodable bytes must not abort the whole report.
                out = stdout.read().decode(errors='replace').strip()
            except (socket.timeout, paramiko.SSHException) as e:
                logger.error(f"SSH command {cmd} on {ip} failed: {e}")
                info[key] = 'error'
                continue
            if key == 'load_avg' and 'load average:' in out:
                # Keep only the figures after the "load average:" label.
                out = out.split('load average:')[-1].strip()
            info[key] = out
        info['status'] = 'online'
    except (paramiko.SSHException, OSError) as e:
        # OSError covers refused connections, unroutable hosts, DNS failures
        # and socket timeouts, none of which derive from SSHException.
        # Authentication errors are SSHException subclasses.
        logger.error(f"SSH connection to {ip} failed: {e}")
        info = {'status': 'unreachable'}
    finally:
        try:
            client.close()
        except Exception as e:
            logger.warning(f"Error closing SSH to {ip}: {e}")
    return ip, info

# OPNsense API-based info gathering
def get_opnsense_info(ip, cfg):
    """Query the health endpoint of an OPNsense-style HTTP API.

    The host and port are taken from ``cfg['api_url']`` (port 443 when the URL
    names none) and used to obtain the certificate that the request is
    verified against.

    Args:
        ip: Identifier of the host, returned unchanged in the result.
        cfg: Host configuration; must provide ``api_url``, ``api_key`` and
            ``api_secret``.

    Returns:
        A tuple ``(ip, info)``. ``info`` holds ``status``, ``uptime``,
        ``memory``, ``load_avg`` and ``disk`` taken from the response, or
        ``{'status': 'unreachable'}`` when the URL is unusable, the request
        fails, or the body is not valid JSON.
    """
    url = cfg['api_url']
    try:
        # urlsplit copes with userinfo, IPv6 literals and missing ports, which
        # manual string splitting gets wrong; .port raises ValueError on junk.
        parsed = urlsplit(url)
        host = parsed.hostname
        port = parsed.port or 443
    except ValueError as e:
        logger.error(f"Invalid OPNsense API URL for {ip}: {e}")
        return ip, {'status': 'unreachable'}
    if not host:
        logger.error(f"Invalid OPNsense API URL for {ip}: no host in {url!r}")
        return ip, {'status': 'unreachable'}

    verify = fetch_certificate(host, port)
    try:
        resp = requests.get(
            f"{url}/core/get/health",
            auth=(cfg['api_key'], cfg['api_secret']),
            verify=verify,
            timeout=5,
        )
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError: the body was not JSON (older requests versions raise it
        # outside the RequestException hierarchy).
        logger.error(f"OPNsense API call for {ip} failed: {e}")
        return ip, {'status': 'unreachable'}

    # Tolerate unexpected shapes instead of failing on .get().
    data = payload.get('health', {}) if isinstance(payload, dict) else {}
    if not isinstance(data, dict):
        data = {}
    return ip, {
        'status': data.get('health', 'unknown'),
        'uptime': data.get('uptime', 'unknown'),
        'memory': f"{data.get('mem_used','?')}MB/{data.get('mem_total','?')}MB",
        'load_avg': data.get('load_avg', 'unknown'),
        'disk': f"{data.get('disk_used','?')}%/{data.get('disk_total','?')}%",
    }

# Gather info for given host or all hosts
def gather_host(ip=None):
    """Gather status information for one configured host, or for all of them.

    Args:
        ip: Key of a host in ``config.HOSTS``. When omitted, empty or not a
            configured host, every host is gathered instead.

    Returns:
        For a known host, a one-element list holding the ``(ip, info)`` tuple
        from the matching collector: SSH for hosts of type ``'generic'``, the
        OPNsense API for every other type. Otherwise, whatever
        ``gather_clients()`` returns.
    """
    if not ip or ip not in config.HOSTS:
        # NOTE(review): gather_clients is not defined in the visible part of
        # this file - confirm it exists, and what type it returns.
        return gather_clients()

    cfg = config.HOSTS[ip]
    if cfg['type'] == 'generic':
        result = get_ssh_info(ip, cfg)
    else:
        result = get_opnsense_info(ip, cfg)
    return [result]

# Ping utility
def ping_ip(ip):
    """Send a single ICMP echo request to *ip* and classify the outcome.

    Args:
        ip: Target address. It comes from chat input, so it is passed after
            ``--`` to keep a value such as ``-f`` from being parsed as a
            ``ping`` option.

    Returns:
        ``'reachable'`` when the ping summary reports one packet received,
        ``'timeout'`` or ``'error'`` when ``run_cmd`` reports that outcome,
        and ``'unreachable'`` in every other case.
    """
    res = run_cmd(['ping', '-c', '1', '--', ip], timeout=3)
    # The summary line is worded differently across ping implementations.
    success_markers = (
        '1 packets transmitted, 1 received',
        '1 packets transmitted, 1 packets received',
    )
    if any(marker in res for marker in success_markers):
        return 'reachable'
    if res in ('timeout', 'error'):
        return res
    return 'unreachable'

# Access control decorator
def restricted(func):
    """Decorator that only lets chats listed in ``ALLOWED_CHATS`` reach *func*.

    Works for both kinds of update used in this file: a message exposes its
    chat as ``.chat``, while a callback query carries it on ``.message.chat``.
    Reading ``.chat`` directly on a callback query would raise
    ``AttributeError``.

    Args:
        func: Handler taking the update object as its first argument.

    Returns:
        A wrapper that calls *func* for authorised chats. For any other chat
        it sends ``'Unauthorized access'`` and returns ``None`` without
        calling *func*.
    """
    @functools.wraps(func)
    def wrapper(msg, *args, **kwargs):
        chat = getattr(msg, 'chat', None)
        is_callback = chat is None
        if is_callback:
            # Callback query: the chat lives on the message the button was on.
            chat = getattr(getattr(msg, 'message', None), 'chat', None)

        if chat is None or chat.id not in ALLOWED_CHATS:
            if is_callback:
                # A callback query cannot be replied to; answer it instead.
                bot.answer_callback_query(msg.id, 'Unauthorized access')
            else:
                bot.reply_to(msg, 'Unauthorized access')
            return
        return func(msg, *args, **kwargs)
    return wrapper

# /status: show menu of available hosts
@bot.message_handler(commands=['status'])
@restricted
def handle_status(msg):
    """Reply to /status with an inline keyboard for choosing a host.

    One button is offered per key of ``config.HOSTS``, followed by an 'All'
    button. Each button carries ``status:<key>`` as its callback data.
    """
    buttons = [
        types.InlineKeyboardButton(ip, callback_data=f'status:{ip}')
        for ip in config.HOSTS
    ]
    buttons.append(types.InlineKeyboardButton('All', callback_data='status:all'))

    keyboard = types.InlineKeyboardMarkup()
    # Add one at a time so every button ends up on its own row.
    for button in buttons:
        keyboard.add(button)
    bot.send_message(msg.chat.id, 'Select host for status:', reply_markup=keyboard)

# Callback handler for inline menu
@bot.callback_query_handler(func=lambda c: bool(c.data) and c.data.startswith('status:'))
@restricted
def callback_status(call):
    """Handle a press on one of the /status inline buttons.

    The callback data has the form ``status:<key>``, where ``<key>`` is a host
    key or ``all``. The gathered report is sent to the chat the button was
    pressed in: one status line per host, plus the uptime, load average,
    memory and disk fields for hosts that are online.
    """
    # Telegram keeps showing a progress indicator on the button until the
    # query is answered, so acknowledge it before the slow gathering starts.
    try:
        bot.answer_callback_query(call.id)
    except Exception as e:
        # Best effort: an unanswerable query must not block the report.
        logger.warning(f"Could not answer callback query {call.id}: {e}")

    _, key = call.data.split(':', 1)
    if key == 'all':
        entries = gather_clients()
    else:
        entries = dict(gather_host(key))

    lines = []
    for ip, info in entries.items():
        lines.append(f"{ip}: {info.get('status','unknown')}")
        if info.get('status') == 'online':
            for field in ('uptime', 'load_avg', 'memory', 'disk'):
                lines.append(f"  {field}: {info.get(field,'-')}")

    # Telegram rejects empty messages and messages over 4096 characters.
    max_len = 4096
    text = '\n'.join(lines) or 'No hosts to report'
    for start in range(0, len(text), max_len):
        bot.send_message(call.message.chat.id, text[start:start + max_len])

# /ping <IP>
@bot.message_handler(commands=['ping'])
@restricted
def handle_ping(msg):
    """Handle ``/ping <IP>``: ping the given target and reply with the result.

    The handler is registered through the ``commands`` filter, like /status,
    so only the exact command is matched. A prefix test on the message text
    would also fire for unrelated commands such as ``/pingfoo``.

    Replies with a usage hint unless exactly one argument follows the command.
    """
    parts = msg.text.split()
    if len(parts) != 2:
        bot.reply_to(msg, 'Usage: /ping <IP>')
        return
    ip = parts[1]
    status = ping_ip(ip)
    bot.reply_to(msg, f"Ping {ip}: {status}")

# Run polling forever, restarting it whenever it stops.
while True:
    try:
        bot.polling()
    except Exception as e:
        logger.error(f"Polling error: {e}")
    # Back off before every restart, not only after an exception: polling()
    # may also return without raising (per the pyTelegramBotAPI docs it stops
    # on an API error unless told otherwise), and restarting immediately
    # would spin in a tight loop.
    time.sleep(5)