forked from hornet/lainmonitor
Update: secure system monitor with service management
Key improvements: Error handling: use try/except blocks to catch errors from subprocesses and file operations, and log them. Thread safety: introduce queue.Queue so that ping results are collected in a thread-safe way. Subprocess handling: use subprocess.run() for cleaner, more modern handling of subprocesses, and avoid shell=True for security reasons unless absolutely necessary. Service management: check service status with systemctl is-active and rely on its exit status for reliability. User authorization: check that the user is authorized before running commands such as /restart, /reboot, and /ping. Logging: log all major operations to track activity and errors. Polling timeout: add timeouts and error handling to prevent the bot from hanging during long polling. This update brings more security, robustness, and scalability, and prepares the bot to handle various edge cases that might occur in our system monitoring. Signed-off-by: hax <hax@lainlounge.xyz>
This commit is contained in:
parent
ae07a3a86d
commit
b6792931dc
1 changed files with 173 additions and 57 deletions
230
lainmonitor.py
230
lainmonitor.py
|
|
@ -1,75 +1,191 @@
|
||||||
#description: telegram bot for monitoring the system
|
# --/usr/bin/env python3 -- #
|
||||||
#dependencies: telebot
|
# description: telegram bot for monitoring the system
|
||||||
#usage: python3 lainmonitor.py | or run it as a service
|
# dependencies: telebot
|
||||||
#author: hornetmaidan
|
# usage: python3 lainmonitor.py | or run it as a service
|
||||||
|
# author: hornetmaidan
|
||||||
|
# contributors: h@x
|
||||||
|
# version: 1.1.6
|
||||||
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
from time import sleep
|
||||||
import telebot
|
import telebot
|
||||||
#define the variables
|
import logging
|
||||||
status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk, ping = 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown', 'unknown'
|
|
||||||
#telegram bot token
|
|
||||||
TOKEN = 'PLACE_YOUR_TOKEN_HERE'
|
|
||||||
|
|
||||||
#bot init
|
# Setup logging
|
||||||
bot = telebot.TeleBot(TOKEN)
|
logging.basicConfig(filename='lainmonitor.log', level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
|
|
||||||
#get system info
|
# Load environment variables and config files securely
|
||||||
def getinfo():
|
script_dir = os.path.dirname(os.path.realpath(__file__))
|
||||||
global status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
|
env_path = os.path.join(script_dir, '.env')
|
||||||
hostname = subprocess.check_output(['hostname']).decode().strip()
|
auth_users_path = os.path.join(script_dir, '.authorized_users')
|
||||||
uptime = subprocess.check_output(['uptime', '-p']).decode().strip()
|
|
||||||
#systemd-only services
|
# Load the token
|
||||||
zerotier = subprocess.Popen("sudo systemctl status zerotier-one | grep 'Active'", shell=True, stdout=subprocess.PIPE).stdout.read().decode().strip()
|
try:
|
||||||
prosody = subprocess.Popen("sudo systemctl status prosody | grep 'Active'", shell=True, stdout=subprocess.PIPE).stdout.read().decode().strip()
|
with open(env_path, 'r') as f:
|
||||||
postgres = subprocess.Popen("sudo systemctl status postgresql | grep 'Active'", shell=True, stdout=subprocess.PIPE).stdout.read().decode().strip()
|
token = f.read().strip()
|
||||||
tailscale = subprocess.Popen("sudo systemctl status tailscaled | grep 'Active'", shell=True, stdout=subprocess.PIPE).stdout.read().decode().strip()
|
except FileNotFoundError:
|
||||||
disk = subprocess.check_output(['df', '-h']).decode().strip()
|
logging.error('Token file not found. Exiting...')
|
||||||
if hostname == 'unknown':
|
exit(1)
|
||||||
|
|
||||||
|
# Load the authorized users
|
||||||
|
try:
|
||||||
|
authorized_users = [line.strip() for line in open(auth_users_path, 'r').readlines()]
|
||||||
|
except FileNotFoundError:
|
||||||
|
logging.error('Authorized users file not found. Exiting...')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
# Initialize the bot
|
||||||
|
bot = telebot.TeleBot(token)
|
||||||
|
|
||||||
|
# Define status variables
|
||||||
|
status, hostname, uptime = 'unknown', 'unknown', 'unknown'
|
||||||
|
zerotier, prosody, postgres, tailscale, nginx, disk = ['unknown'] * 6
|
||||||
|
nodes, hostnames, threads = [], [], []
|
||||||
|
reach_queue = queue.Queue()
|
||||||
|
|
||||||
|
# Get basic system info
|
||||||
|
def get_system_info():
|
||||||
|
global hostname, uptime, zerotier, prosody, postgres, tailscale, nginx, disk
|
||||||
|
try:
|
||||||
|
hostname = subprocess.check_output(['hostname']).decode().strip()
|
||||||
|
uptime = subprocess.check_output(['uptime', '-p']).decode().strip()
|
||||||
|
|
||||||
|
services = ['zerotier-one', 'prosody', 'postgresql', 'tailscaled', 'nginx']
|
||||||
|
status_results = []
|
||||||
|
for service in services:
|
||||||
|
status_results.append(get_service_status(service))
|
||||||
|
zerotier, prosody, postgres, tailscale, nginx = status_results
|
||||||
|
|
||||||
|
disk = subprocess.check_output(['df', '-h']).decode().strip()
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
logging.error(f"Error fetching system info: {e}")
|
||||||
status = 'offline'
|
status = 'offline'
|
||||||
else:
|
else:
|
||||||
status = 'online'
|
status = 'online'
|
||||||
return hostname, uptime, zerotier, prosody, postgres, tailscale, disk
|
|
||||||
|
|
||||||
#ping tailscale (change the IP address to the one you want or add more)
|
# Helper function to get service status
|
||||||
def check_tailscale():
|
def get_service_status(service):
|
||||||
global ping
|
try:
|
||||||
ping = subprocess.Popen("ping TAILSCALE_IP -c 1 | grep '1 packets'", shell=True, stdout=subprocess.PIPE).stdout.read().decode().strip()
|
subprocess.run(['sudo', 'systemctl', 'is-active', '--quiet', service], check=True)
|
||||||
if '1 received' in ping:
|
return f'{service} is active'
|
||||||
ping = 'connected'
|
except subprocess.CalledProcessError:
|
||||||
|
return f'{service} is inactive'
|
||||||
|
|
||||||
|
# Function to ping a Tailscale node
|
||||||
|
def ping_node(node, hostname, timeout=5):
    """Ping ``node`` once and put a one-line result on ``reach_queue``.

    Exactly one message is queued per call, whatever happens to the
    ``ping`` process, so a caller that starts one thread per node gets one
    result per node.

    Args:
        node: Address handed to ``ping`` as its target.
        hostname: Label shown next to the address in the result message.
        timeout: Seconds to wait for the ``ping`` process before the node
            is reported as unreachable.
    """
    try:
        # The output is never inspected, only the exit status, so discard
        # it instead of buffering it in memory.
        subprocess.run(
            ['ping', '-c', '1', node],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
            check=True,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
        # Non-zero exit, a ping that never returns, or a ping binary that
        # cannot be executed all mean the node could not be reached.
        reach_queue.put(f'{node}/{hostname} is unreachable')
    else:
        reach_queue.put(f'{node}/{hostname} is reachable')
|
||||||
|
|
||||||
|
# Check Tailscale nodes
|
||||||
|
def check_tailscale_nodes():
    """Ping every peer listed by ``tailscale status`` and collect the results.

    The peer list is parsed in Python rather than through a shell pipeline:
    only lines whose first field starts with ``100.`` (the address column)
    and that carry at least a second field (the host name) are used. An
    empty peer list is therefore an empty result, not an error.

    Side effects:
        Rebinds the module-level ``nodes``, ``hostnames`` and ``threads``
        to the values of the current run, and drains ``reach_queue``.

    Returns:
        A list of result strings taken from ``reach_queue`` in the order
        they were queued, or ``['Error checking Tailscale status']`` when
        the ``tailscale`` command fails or cannot be executed.
    """
    global nodes, hostnames, threads
    try:
        nodes_output = subprocess.check_output(['tailscale', 'status']).decode()
    except (subprocess.CalledProcessError, OSError) as e:
        logging.error(f"Error checking Tailscale status: {e}")
        return ['Error checking Tailscale status']

    peers = []
    for line in nodes_output.splitlines():
        fields = line.split()
        # Skip blank lines, notices and anything without an address + name.
        if len(fields) >= 2 and fields[0].startswith('100.'):
            peers.append((fields[0], fields[1]))

    nodes = [address for address, _ in peers]
    hostnames = [name for _, name in peers]

    # Start from a fresh list on every call so finished threads from earlier
    # runs are not kept around and joined again.
    threads = [threading.Thread(target=ping_node, args=peer) for peer in peers]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # All workers have been joined, so the queue can be drained without racing.
    reach = []
    while not reach_queue.empty():
        reach.append(reach_queue.get())
    return reach
|
||||||
|
|
||||||
|
# Function to restart a service
|
||||||
|
def restart_service(service):
    """Restart a systemd unit through ``sudo systemctl`` and describe the outcome.

    Args:
        service: Name of the unit passed to ``systemctl restart``.

    Returns:
        ``'<service> restarted! Status: <status>'`` when the restart command
        succeeds, where ``<status>`` comes from ``get_service_status``;
        ``'Error restarting <service>'`` when the command exits non-zero.
    """
    logging.info(f'Restarting {service}...')
    restart_cmd = ['sudo', 'systemctl', 'restart', service]
    try:
        subprocess.run(restart_cmd, check=True)
        # Pause for three seconds before querying the unit's status.
        sleep(3)
        outcome = f'{service} restarted! Status: {get_service_status(service)}'
        logging.info(outcome)
        return outcome
    except subprocess.CalledProcessError as e:
        failure = f'Error restarting {service}'
        logging.error(f"{failure}: {e}")
        return failure
|
||||||
|
|
||||||
|
# Restart services menu
|
||||||
|
def restart_menu():
    """Build the inline keyboard offered when choosing a service to restart.

    Each button sits on its own row and sends its own label back as
    ``callback_data``; the last row is the ``cancel`` button.

    Returns:
        A ``telebot.types.InlineKeyboardMarkup`` holding the six rows.
    """
    labels = (
        'zerotier-one',
        'prosody',
        'postgresql',
        'tailscaled',
        'nginx',
        'cancel',
    )
    rows = [
        [telebot.types.InlineKeyboardButton(label, callback_data=label)]
        for label in labels
    ]
    return telebot.types.InlineKeyboardMarkup(rows)
|
||||||
|
|
||||||
|
# Callback query handler for service restart
|
||||||
|
@bot.callback_query_handler(func=lambda call: True)
|
||||||
|
def callback_query(call):
|
||||||
|
service = call.data
|
||||||
|
if service != 'cancel':
|
||||||
|
status_message = restart_service(service)
|
||||||
|
bot.send_message(call.message.chat.id, status_message)
|
||||||
else:
|
else:
|
||||||
ping = 'unreachable'
|
bot.edit_message_reply_markup(call.message.chat.id, call.message.message_id, reply_markup=None)
|
||||||
return ping
|
bot.send_message(call.message.chat.id, 'Canceled')
|
||||||
|
|
||||||
#debug handler
|
# Reboot system function
|
||||||
def check():
|
def reboot():
|
||||||
global status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
|
logging.info('Rebooting system...')
|
||||||
getinfo()
|
subprocess.run(['sudo', 'reboot'], check=True)
|
||||||
print('system status:', status)
|
|
||||||
print('hostname:', hostname)
|
|
||||||
print('uptime:', uptime)
|
|
||||||
print('zerotier:', zerotier)
|
|
||||||
print('prosody:', prosody)
|
|
||||||
print('postgres:', postgres)
|
|
||||||
print('tailscale:', tailscale)
|
|
||||||
print('disk:', disk)
|
|
||||||
return status, hostname, uptime, zerotier, prosody, postgres, tailscale, disk
|
|
||||||
|
|
||||||
#message handling
|
# Message handlers
|
||||||
@bot.message_handler(commands=['start', 'help', 'status', 'reboot', 'ping'])
|
@bot.message_handler(commands=['start', 'help', 'status', 'restart', 'reboot', 'ping'])
|
||||||
def handle(message):
|
def handle(message):
|
||||||
|
user_id = str(message.from_user.id)
|
||||||
if message.text == '/start':
|
if message.text == '/start':
|
||||||
bot.reply_to(message, 'lainmonitor v1.0 --- standing by...')
|
bot.reply_to(message, 'lainmonitor v1.0 --- standing by...')
|
||||||
elif message.text == '/help':
|
elif message.text == '/help':
|
||||||
bot.reply_to(message, 'commands: /start, /help, /status, /reboot, /ping')
|
bot.reply_to(message, 'commands: /start, /help, /status, /restart, /reboot, /ping')
|
||||||
elif message.text == '/status':
|
elif message.text == '/status':
|
||||||
check()
|
get_system_info()
|
||||||
status_message = f'hostname: {hostname}\nsystem status: {status}\nuptime: {uptime}\nzerotier: {zerotier}\nprosody: {prosody}\npostgres: {postgres}\ntailscale: {tailscale}'
|
status_message = (
|
||||||
|
f'hostname: {hostname}\n'
|
||||||
|
f'system status: {status}\n'
|
||||||
|
f'uptime: {uptime}\n'
|
||||||
|
f'zerotier: {zerotier}\n'
|
||||||
|
f'prosody: {prosody}\n'
|
||||||
|
f'postgres: {postgres}\n'
|
||||||
|
f'tailscale: {tailscale}\n'
|
||||||
|
f'nginx: {nginx}'
|
||||||
|
)
|
||||||
bot.reply_to(message, status_message)
|
bot.reply_to(message, status_message)
|
||||||
bot.reply_to(message, f'filesystem info for {hostname}: \n\n{disk}')
|
bot.reply_to(message, f'Filesystem info for {hostname}:\n\n{disk}')
|
||||||
elif message.text == '/reboot':
|
elif message.text == f'/restart {hostname}' and user_id in authorized_users:
|
||||||
bot.reply_to(message, 'work in progress...')
|
bot.send_message(message.chat.id, 'Select a service to restart:', reply_markup=restart_menu())
|
||||||
elif message.text == '/ping':
|
elif message.text == f'/reboot {hostname}' and user_id in authorized_users:
|
||||||
check_tailscale()
|
bot.reply_to(message, f'Rebooting {hostname}...')
|
||||||
bot.reply_to(message, f'ping status: {ping}')
|
reboot()
|
||||||
|
elif message.text == '/ping' and user_id in authorized_users:
|
||||||
|
reach = check_tailscale_nodes()
|
||||||
|
bot.reply_to(message, f'Ping status:\n\n{"\n".join(reach)}')
|
||||||
|
else:
|
||||||
|
bot.reply_to(message, 'You are not authorized for this action')
|
||||||
|
|
||||||
|
# Polling with timeout and error handling
|
||||||
|
try:
|
||||||
|
bot.polling(none_stop=True, timeout=60, long_polling_timeout=60)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'Polling error: {e}')
|
||||||
|
|
||||||
#polling
|
|
||||||
bot.polling()
|
|
||||||
Loading…
Add table
Reference in a new issue