Updoot secure system monitor with service management

Key Improvements:

Error Handling:
Used try-except blocks to catch errors from subprocesses and file operations, logging issues.

Thread Safety:
Introduced queue.Queue for thread-safe operations when handling ping results.

Subprocess Optimizations:
Used subprocess.run() for cleaner, more modern handling of subprocesses.
Avoided shell=True for security reasons unless absolutely necessary.

Service Management:
Improved service status checking by using `systemctl is-active` and relying on its exit status for reliability.

User Authorization:
Checked user authorization in relevant commands like /restart, /reboot, and /ping.

Logging:
Introduced logging for all major operations to track activity and errors.

Polling Timeout:
Added timeouts and error handling to prevent the bot from hanging during long polling.

This updoot brings more security, robustness, and scalability, and is ready to handle the various edge cases that might occur in our system monitoring.

Signed-off-by: hax <hax@lainlounge.xyz>
This commit is contained in:
h@x 2024-10-25 00:29:37 +00:00
parent ae07a3a86d
commit b6792931dc

View file

@ -1,75 +1,191 @@
#description: telegram bot for monitoring the system
#dependencies: telebot
#usage: python3 lainmonitor.py | or run it as a service
#author: hornetmaidan
# --/usr/bin/env python3 -- #
# description: telegram bot for monitoring the system
# dependencies: telebot
# usage: python3 lainmonitor.py | or run it as a service
# author: hornetmaidan
# contributors: h@x
# version: 1.1.6
import os
import subprocess
import threading
import queue
from time import sleep
import telebot
# Module-level readings shared between the functions below; every field starts
# as the string 'unknown' until a check overwrites it.
status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk, ping = 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
# Telegram bot token placeholder.
# NOTE(review): `bot` is created again further down from the token read out of
# the `.env` file, so this placeholder-based instance looks like a leftover --
# confirm before removing.
TOKEN = 'PLACE_YOUR_TOKEN_HERE'
import logging
# Bot instance built from TOKEN.
bot = telebot.TeleBot(TOKEN)
# Send INFO-and-above records to lainmonitor.log (path relative to the working
# directory), each line prefixed with a timestamp and the level name.
logging.basicConfig(filename='lainmonitor.log', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
#get system info
def _service_active_lines(service):
    """Return the lines of ``sudo systemctl status <service>`` containing 'Active'.

    The exit code is deliberately ignored: ``systemctl status`` exits non-zero
    for stopped units, and only the text of the "Active:" line is wanted.
    An empty string is returned when the command cannot be started.
    """
    try:
        result = subprocess.run(
            ['sudo', 'systemctl', 'status', service],
            stdout=subprocess.PIPE,
        )
    except OSError:
        return ''
    matching = [
        line for line in result.stdout.decode().splitlines() if 'Active' in line
    ]
    return '\n'.join(matching).strip()


def getinfo():
    """Refresh the module-level system readings and return them.

    Runs ``hostname``, ``uptime -p``, ``systemctl status`` for each monitored
    service and ``df -h``, and stores the results in the module globals.
    ``status`` becomes 'offline' while the hostname is still 'unknown' and
    'online' otherwise.

    Returns:
        tuple: ``(hostname, uptime, zerotier, prosody, postgres, tailscale, disk)``.

    Raises:
        subprocess.CalledProcessError: if ``hostname``, ``uptime`` or ``df``
            exits with a non-zero status.
    """
    global status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
    hostname = subprocess.check_output(['hostname']).decode().strip()
    uptime = subprocess.check_output(['uptime', '-p']).decode().strip()
    # systemd-only services; queried with argument lists instead of a shell
    # pipeline so no shell is involved and every child process is waited for.
    zerotier = _service_active_lines('zerotier-one')
    prosody = _service_active_lines('prosody')
    postgres = _service_active_lines('postgresql')
    tailscale = _service_active_lines('tailscaled')
    disk = subprocess.check_output(['df', '-h']).decode().strip()
    status = 'offline' if hostname == 'unknown' else 'online'
    return hostname, uptime, zerotier, prosody, postgres, tailscale, disk
# Load environment variables and config files securely.
# Both files are looked up next to this script, not in the working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
env_path = os.path.join(script_dir, '.env')
auth_users_path = os.path.join(script_dir, '.authorized_users')

# Load the token: the whole stripped content of .env is used as the token.
try:
    with open(env_path, 'r') as f:
        token = f.read().strip()
except FileNotFoundError:
    logging.error('Token file not found. Exiting...')
    # SystemExit is what exit() raises, without depending on the site module.
    raise SystemExit(1) from None

# Load the authorized users: one user id per line, kept as strings because
# handlers compare them against str(user id). Blank lines are skipped.
try:
    with open(auth_users_path, 'r') as f:
        authorized_users = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    logging.error('Authorized users file not found. Exiting...')
    raise SystemExit(1) from None

# Initialize the bot
bot = telebot.TeleBot(token)

# Define status variables; everything reads 'unknown' until the first check.
status, hostname, uptime = 'unknown', 'unknown', 'unknown'
zerotier, prosody, postgres, tailscale, nginx, disk = ['unknown'] * 6
nodes, hostnames, threads = [], [], []
# Ping worker threads report their result lines through this queue.
reach_queue = queue.Queue()
# Get basic system info
def get_system_info():
    """Refresh the module-level system readings and return them.

    Collects the hostname, the pretty-printed uptime, the state of each
    monitored systemd service (via ``get_service_status``) and ``df -h``
    output. ``status`` is set to 'online' when every command succeeded and to
    'offline' when one of them failed; failures are logged, not raised.

    Returns:
        tuple: ``(hostname, uptime, zerotier, prosody, postgres, tailscale, disk)``.
        Values that could not be refreshed keep their previous content.
    """
    # `status` must be listed here too, otherwise the assignments below only
    # create a local and the module-level status never changes.
    global status, hostname, uptime, zerotier, prosody, postgres, tailscale, nginx, disk
    services = ['zerotier-one', 'prosody', 'postgresql', 'tailscaled', 'nginx']
    try:
        hostname = subprocess.check_output(['hostname']).decode().strip()
        uptime = subprocess.check_output(['uptime', '-p']).decode().strip()
        zerotier, prosody, postgres, tailscale, nginx = [
            get_service_status(service) for service in services
        ]
        disk = subprocess.check_output(['df', '-h']).decode().strip()
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a binary that is missing or not executable.
        logging.error(f"Error fetching system info: {e}")
        status = 'offline'
    else:
        status = 'online'
    return hostname, uptime, zerotier, prosody, postgres, tailscale, disk
#ping tailscale (change the IP address to the one you want or add more)
def check_tailscale():
    """Ping the configured Tailscale address once and record the outcome.

    ``TAILSCALE_IP`` is a placeholder that has to be replaced with the real
    address. The module-level ``ping`` is updated and also returned.

    Returns:
        str: 'connected' when ping reports '1 received', otherwise 'unreachable'.
    """
    global ping
    try:
        # Argument list instead of a shell pipeline: no shell is spawned and
        # the child process is always waited for.
        result = subprocess.run(
            ['ping', 'TAILSCALE_IP', '-c', '1'],
            stdout=subprocess.PIPE,
        )
        output = result.stdout.decode()
    except OSError:
        # ping itself could not be started; treat the peer as unreachable.
        output = ''
    ping = 'connected' if '1 received' in output else 'unreachable'
    return ping
# Helper function to get service status
def get_service_status(service):
    """Report whether a systemd unit is active.

    Args:
        service: systemd unit name, e.g. ``'nginx'``.

    Returns:
        str: ``'<service> is active'`` when ``systemctl is-active`` exits with
        0, ``'<service> is inactive'`` when it exits non-zero, and
        ``'<service> status unknown'`` when the command could not be started
        or did not finish within the timeout.
    """
    command = ['sudo', 'systemctl', 'is-active', '--quiet', service]
    try:
        # The timeout keeps a stuck sudo/systemctl from blocking the bot.
        subprocess.run(command, check=True, timeout=10)
    except subprocess.CalledProcessError:
        return f'{service} is inactive'
    except (OSError, subprocess.TimeoutExpired) as e:
        logging.error(f"Could not query status of {service}: {e}")
        return f'{service} status unknown'
    return f'{service} is active'
# Function to ping a Tailscale node
def ping_node(node, hostname):
    """Ping one node once and put a result line on ``reach_queue``.

    Intended to run in a worker thread; exactly one line is queued per call,
    whatever happens, so the caller's report never silently loses a node.

    Args:
        node: address handed to ``ping``.
        hostname: label shown next to the address in the result line.
    """
    try:
        subprocess.run(
            ['ping', '-c', '1', node],
            # The output is never read, so discard it instead of piping it.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
            # Bound the wait so one stuck ping cannot block the join forever.
            timeout=15,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        reach_queue.put(f'{node}/{hostname} is unreachable')
    else:
        reach_queue.put(f'{node}/{hostname} is reachable')
# Check Tailscale nodes
def check_tailscale_nodes():
    """Ping every node listed by ``tailscale status`` in parallel.

    A line counts as a node when its first field starts with ``100.`` and a
    second field (the host name) is present. The module-level ``nodes``,
    ``hostnames`` and ``threads`` are replaced with the values of this run.

    Returns:
        list[str]: one result line per node as queued by ``ping_node``, a
        single 'No Tailscale nodes found' entry when the listing contains no
        node, or a single error entry when ``tailscale status`` failed.
    """
    global nodes, hostnames, threads
    try:
        # No shell and no grep: filtering happens below, so an empty listing
        # is not mistaken for a command failure.
        status_output = subprocess.check_output(['tailscale', 'status']).decode()
    except (subprocess.CalledProcessError, OSError) as e:
        logging.error(f"Error checking Tailscale status: {e}")
        return ['Error checking Tailscale status']

    peers = []
    for line in status_output.splitlines():
        fields = line.split()
        if len(fields) >= 2 and fields[0].startswith('100.'):
            peers.append((fields[0], fields[1]))
    nodes = [address for address, _ in peers]
    hostnames = [name for _, name in peers]
    if not peers:
        threads = []
        return ['No Tailscale nodes found']

    # Build a fresh thread list on every call; appending to the old one made
    # it grow without bound and re-joined threads that had already finished.
    threads = [
        threading.Thread(target=ping_node, args=(address, name))
        for address, name in peers
    ]
    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()

    # All workers are done, so the queue can be drained without blocking.
    reach = []
    while not reach_queue.empty():
        reach.append(reach_queue.get())
    return reach
# Function to restart a service
def restart_service(service):
    """Restart a systemd unit and report its state afterwards.

    Args:
        service: systemd unit name passed to ``systemctl restart``.

    Returns:
        str: ``'<service> restarted! Status: <status>'`` on success, or
        ``'Error restarting <service>'`` when the restart command failed or
        could not be started. Both outcomes are logged.
    """
    logging.info(f'Restarting {service}...')
    try:
        subprocess.run(['sudo', 'systemctl', 'restart', service], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError: sudo/systemctl missing or not executable.
        logging.error(f"Error restarting {service}: {e}")
        return f'Error restarting {service}'
    # Give the unit a moment to come up before asking for its state.
    sleep(3)
    service_status = get_service_status(service)
    status_message = f'{service} restarted! Status: {service_status}'
    logging.info(status_message)
    return status_message
# Restart services menu
def restart_menu():
    """Build the inline keyboard used to pick a service to restart.

    Each entry gets its own row, and the button label doubles as the
    callback data; the last row is the 'cancel' button.

    Returns:
        telebot.types.InlineKeyboardMarkup: the assembled keyboard.
    """
    labels = ('zerotier-one', 'prosody', 'postgresql', 'tailscaled', 'nginx', 'cancel')
    rows = [
        [telebot.types.InlineKeyboardButton(label, callback_data=label)]
        for label in labels
    ]
    return telebot.types.InlineKeyboardMarkup(rows)
# Callback query handler for service restart
@bot.callback_query_handler(func=lambda call: True)
def callback_query(call):
    """Handle a press on the restart menu.

    Only users listed in ``authorized_users`` may act on the menu. 'cancel'
    removes the keyboard and confirms; a known service name is restarted and
    the outcome is sent to the chat; anything else is rejected.

    Args:
        call: the callback query delivered by telebot.
    """
    chat_id = call.message.chat.id
    # The menu may be visible to other chat members, so check who pressed it.
    if str(call.from_user.id) not in authorized_users:
        bot.send_message(chat_id, 'You are not authorized for this action')
        return

    service = call.data
    if service == 'cancel':
        bot.edit_message_reply_markup(chat_id, call.message.message_id, reply_markup=None)
        bot.send_message(chat_id, 'Canceled')
        return

    # Callback data is client-supplied; accept only the units the menu offers
    # before handing the value to `systemctl restart`.
    restartable = ('zerotier-one', 'prosody', 'postgresql', 'tailscaled', 'nginx')
    if service not in restartable:
        logging.error(f'Rejected restart request for unknown service: {service}')
        bot.send_message(chat_id, 'Unknown service')
        return

    status_message = restart_service(service)
    bot.send_message(chat_id, status_message)
#debug handler
def check():
    """Debug helper: refresh the readings with getinfo() and print them.

    Returns:
        tuple: ``(status, hostname, uptime, zerotier, prosody, postgres,
        tailscale, disk)`` as stored in the module globals after the refresh.
    """
    global status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
    getinfo()
    # Label/value pairs in the order they are printed.
    readings = (
        ('system status:', status),
        ('hostname:', hostname),
        ('uptime:', uptime),
        ('zerotier:', zerotier),
        ('prosody:', prosody),
        ('postgres:', postgres),
        ('tailscale:', tailscale),
        ('disk:', disk),
    )
    for label, value in readings:
        print(label, value)
    return status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
# Reboot system function
def reboot():
    """Log the request, then reboot the host with ``sudo reboot``.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    reboot_command = ['sudo', 'reboot']
    logging.info('Rebooting system...')
    subprocess.run(reboot_command, check=True)
#message handling
@bot.message_handler(commands=['start', 'help', 'status', 'restart', 'reboot', 'ping'])
def handle(message):
    """Dispatch the bot commands.

    /start, /help and /status are open to everyone. ``/restart <hostname>``,
    ``/reboot <hostname>`` and ``/ping`` require the sender's id to be listed
    in ``authorized_users``; anything else gets the not-authorized reply.

    Args:
        message: the incoming telebot message.
    """
    user_id = str(message.from_user.id)
    text = message.text
    authorized = user_id in authorized_users

    # /restart and /reboot must name this host. Until the first refresh the
    # hostname is still 'unknown', which would reject every such request.
    if hostname == 'unknown' and text.startswith(('/restart', '/reboot')):
        get_system_info()

    if text == '/start':
        bot.reply_to(message, 'lainmonitor v1.0 --- standing by...')
    elif text == '/help':
        bot.reply_to(message, 'commands: /start, /help, /status, /restart, /reboot, /ping')
    elif text == '/status':
        get_system_info()
        status_message = (
            f'hostname: {hostname}\n'
            f'system status: {status}\n'
            f'uptime: {uptime}\n'
            f'zerotier: {zerotier}\n'
            f'prosody: {prosody}\n'
            f'postgres: {postgres}\n'
            f'tailscale: {tailscale}\n'
            f'nginx: {nginx}'
        )
        bot.reply_to(message, status_message)
        bot.reply_to(message, f'Filesystem info for {hostname}:\n\n{disk}')
    elif text == f'/restart {hostname}' and authorized:
        bot.send_message(message.chat.id, 'Select a service to restart:', reply_markup=restart_menu())
    elif text == f'/reboot {hostname}' and authorized:
        bot.reply_to(message, f'Rebooting {hostname}...')
        reboot()
    elif text == '/ping' and authorized:
        reach = check_tailscale_nodes()
        # Joined outside the f-string: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        ping_report = '\n'.join(reach)
        bot.reply_to(message, f'Ping status:\n\n{ping_report}')
    else:
        bot.reply_to(message, 'You are not authorized for this action')
# Polling with timeout and error handling
try:
    bot.polling(none_stop=True, timeout=60, long_polling_timeout=60)
except Exception as e:
    # Top-level boundary: record the failure together with its traceback.
    # No second, unguarded polling call follows, so a fatal polling error is
    # not hidden behind a silent restart without timeouts.
    logging.exception(f'Polling error: {e}')