From 3691c50bb90862d67b4a9df12079fe9a308d92f3 Mon Sep 17 00:00:00 2001
From: 0n1cOn3 <0n1cOn3@gmx.ch>
Date: Mon, 7 Jul 2025 04:48:40 +0200
Subject: [PATCH] New tool added and better structured repo

---
 flac2m4a => Convert/flac2m4a                  |   0
 wave2m4a => Convert/wave2m4a                  |   0
 .../File Organisation/audio-organizer.py      | 233 ++++++++++++++++++
 Filesystem/File Organisation/requirements.txt |   6 +
 typechecker => Filesystem/typechecker         |   0
 5 files changed, 239 insertions(+)
 rename flac2m4a => Convert/flac2m4a (100%)
 rename wave2m4a => Convert/wave2m4a (100%)
 create mode 100755 Filesystem/File Organisation/audio-organizer.py
 create mode 100644 Filesystem/File Organisation/requirements.txt
 rename typechecker => Filesystem/typechecker (100%)

diff --git a/flac2m4a b/Convert/flac2m4a
similarity index 100%
rename from flac2m4a
rename to Convert/flac2m4a
diff --git a/wave2m4a b/Convert/wave2m4a
similarity index 100%
rename from wave2m4a
rename to Convert/wave2m4a
diff --git a/Filesystem/File Organisation/audio-organizer.py b/Filesystem/File Organisation/audio-organizer.py
new file mode 100755
index 0000000..b9cb367
--- /dev/null
+++ b/Filesystem/File Organisation/audio-organizer.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+Audio Organizer with Free Spectral Fingerprint API Integration
+
+A robust, forward-looking script for analysing, categorising and preparing audio files.
+It integrates local metadata extraction, the free AcoustID/Chromaprint API for spectral
+analysis, a MusicBrainz fallback, duplicate detection and a resilient HTTP client with retries.
+Optional: mmap-based hashing, used only when the environment variable MMAP_ENABLED=1 is set.
+"""
+import os
+import time
+import hashlib
+import logging
+import argparse
+import requests
+import shutil
+from pathlib import Path
+import mutagen
+import musicbrainzngs
+import chromaprint
+from concurrent.futures import ThreadPoolExecutor
+import threading
+from tqdm import tqdm
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+import mmap
+from functools import lru_cache
+
+logger = logging.getLogger('audio_organizer')
+
+MB_APP_NAME = os.getenv('MB_APP_NAME', 'AudioOrganizerCorp')
+MB_APP_VERSION = os.getenv('MB_APP_VERSION', '2.1')
+MB_APP_CONTACT = os.getenv('MB_APP_CONTACT', 'contact@example.com')
+ACOUSTID_LOOKUP_URL = 'https://api.acoustid.org/v2/lookup'
+ACOUSTID_API_KEY = os.getenv('ACOUSTID_API_KEY', '')
+HASH_ALGO = os.getenv('HASH_ALGO', 'md5').lower()
+BLOCK_SIZE = int(os.getenv('HASH_BLOCK_SIZE', '65536'))
+RATE_LIMIT_DELAY = float(os.getenv('RATE_LIMIT_DELAY', '1.0'))
+MMAP_THRESHOLD = int(os.getenv('MMAP_THRESHOLD', str(100 * 1024 * 1024)))  # 100 MB
+MMAP_ENABLED = os.getenv('MMAP_ENABLED', '0') == '1'
+
+class AudioOrganizer:
+    def __init__(self, source_dir, target_dir, duplicates_log, workers=None):
+        self.source = Path(source_dir)
+        self.target = Path(target_dir)
+        self.duplicates_log = Path(duplicates_log)
+        self._validate_directories()
+        self.supported_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a'}
+        self.seen_hashes = {}
+        self.duplicates = []
+        self.lock = threading.Lock()
+        self.workers = workers or os.cpu_count()
+        self.session = requests.Session()
+        retry = Retry(total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET", "POST"])
+        adapter = HTTPAdapter(max_retries=retry)
+        self.session.mount('https://', adapter)
+        self.session.mount('http://', adapter)
+        if not ACOUSTID_API_KEY:
+            logger.warning("ACOUSTID_API_KEY is not set; the HTTP lookup will be skipped.")
+
+    def _validate_directories(self):
+        if not self.source.is_dir() or not os.access(self.source, os.R_OK):
+            raise ValueError(f"Source directory is not accessible: {self.source}")
+        self.target.mkdir(parents=True, exist_ok=True)
+        if not self.target.is_dir() or not os.access(self.target, os.W_OK):
+            raise PermissionError(f"Target directory is not writable: {self.target}")
+
+    def compute_hash(self, filepath):
+        hasher = hashlib.blake2b() if HASH_ALGO == 'blake2b' else hashlib.md5()
+        size = filepath.stat().st_size
+        use_mmap = MMAP_ENABLED and size >= MMAP_THRESHOLD
+        with filepath.open('rb') as f:
+            if use_mmap:
+                mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+                hasher.update(mm)
+                mm.close()
+            else:
+                for chunk in iter(lambda: f.read(BLOCK_SIZE), b""):
+                    hasher.update(chunk)
+        return hasher.hexdigest()
+
+    def process_file(self, path):
+        try:
+            if path.suffix.lower() not in self.supported_exts:
+                return
+            file_hash = self.compute_hash(path)
+            with self.lock:
+                if file_hash in self.seen_hashes:
+                    self.duplicates.append(path)
+                    return
+                self.seen_hashes[file_hash] = path
+            album, artist, title = self.extract_metadata(path)
+            dest = self._prepare_destination(path, album)
+            shutil.move(str(path), str(dest))
+        except Exception as e:
+            logger.error(f"Error while processing {path}: {e}")
+
+    def extract_metadata(self, filepath):
+        tags = mutagen.File(filepath, easy=True)
+        if tags and tags.tags and tags.tags.get('album'):
+            return tags.tags['album'][0], tags.tags.get('artist', [None])[0], tags.tags.get('title', [None])[0]
+        if ACOUSTID_API_KEY:
+            res = self.acoustid_lookup_http(filepath)
+            if res:
+                return res
+        return self.fetch_metadata_online(filepath.stem)
+
+    def acoustid_lookup_http(self, filepath):
+        # Adaptive backoff plus extended error and timeout handling
+        min_delay = RATE_LIMIT_DELAY
+        max_delay = 20.0
+        delay = min_delay
+        max_retries = 5
+        tries = 0
+
+        while tries < max_retries:
+            time.sleep(delay)
+            try:
+                try:
+                    duration, fp = chromaprint.fingerprint_file(str(filepath))
+                except chromaprint.FingerprintError as e:
+                    logger.error(f"Chromaprint error for {filepath.name}: {e}")
+                    return None
+                resp = self.session.get(
+                    ACOUSTID_LOOKUP_URL,
+                    params={'client': ACOUSTID_API_KEY, 'duration': duration, 'fingerprint': fp, 'meta': 'recordings'},
+                    timeout=15
+                )
+                if resp.status_code == 429:
+                    logger.warning(f"AcoustID API rate limit reached (HTTP 429), backoff increased to {min(max_delay, delay * 2):.1f}s.")
+                    delay = min(max_delay, delay * 2)
+                    tries += 1
+                    continue
+                resp.raise_for_status()
+                json_data = resp.json()
+                if 'results' not in json_data or not json_data['results']:
+                    logger.warning(f"Empty or invalid AcoustID response for {filepath.name}: {json_data}")
+                    return None
+                recs = json_data['results'][0].get('recordings')
+                if not recs:
+                    return None
+                rec = recs[0]
+                album = self._fetch_album_from_mbid(rec.get('id'))
+                return album, rec.get('artists', [{}])[0].get('name'), rec.get('title')  # (album, artist, title), matching the other metadata paths
+            except requests.Timeout:
+                logger.warning(f"Timeout during AcoustID request ({filepath.name}), next attempt in {min(max_delay, delay * 2):.1f}s.")
+                delay = min(max_delay, delay * 2)
+                tries += 1
+            except requests.RequestException as e:
+                logger.error(f"Network error during AcoustID lookup: {e} ({filepath.name})")
+                break
+            except Exception as e:
+                logger.error(f"Unexpected error during AcoustID lookup: {e} ({filepath.name})")
+                break
+
+        logger.error(f"Maximum retries for the AcoustID API reached for {filepath.name}; skipping.")
+        return None
+
+    def _fetch_album_from_mbid(self, mbid):
+        if not mbid:
+            return None
+        try:
+            rec = musicbrainzngs.get_recording_by_id(mbid, includes=['releases'])
+            rels = rec['recording'].get('release-list', [])
+            return rels[0]['title'] if rels else None
+        except Exception:
+            return None
+
+    @lru_cache(maxsize=128)
+    def fetch_metadata_online(self, stem):
+        try:
+            res = musicbrainzngs.search_recordings(query=stem.replace('_', ' '), limit=1)
+            rec = res.get('recording-list', [None])[0]
+            if not rec:
+                return (None, None, None)
+            return (rec['release-list'][0]['title'], rec['artist-credit-phrase'], rec['title'])
+        except Exception as e:
+            logger.error(f"MusicBrainz fallback failed: {e}")
+            return (None, None, None)
+
+    def _prepare_destination(self, path, album):
+        letter = album[0].upper() if album and album[0].isalpha() else '_'
+        folder = self.target / letter / (album or 'Unknown Album')
+        folder.mkdir(parents=True, exist_ok=True)
+        dest = folder / path.name
+        i = 1
+        while dest.exists():
+            base, ext = path.stem, path.suffix
+            dest = folder / f"{base}_{i}{ext}"
+            i += 1
+        return dest
+
+    def organize(self):
+        total = sum(1 for _, _, files in os.walk(self.source) for _ in files)
+        try:
+            with ThreadPoolExecutor(max_workers=self.workers) as executor:
+                paths = (Path(r) / f for r, _, files in os.walk(self.source) for f in files)
+                for _ in tqdm(executor.map(self.process_file, paths), total=total, desc="Processing audio files"):
+                    pass
+        except KeyboardInterrupt:
+            logger.warning("Interrupted by user, shutting down...")
+        finally:
+            if self.duplicates:
+                self.duplicates_log.parent.mkdir(parents=True, exist_ok=True)
+                self.duplicates_log.write_text("\n".join(str(p) for p in self.duplicates), encoding='utf-8')
+            self.session.close()
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source')
+    parser.add_argument('target')
+    parser.add_argument('--duplicates-log', default='duplicates.txt')
+    cpu_max = os.cpu_count() or 1
+    parser.add_argument('--workers', type=int, choices=range(1, cpu_max + 1), metavar=f'[1-{cpu_max}]')
+    parser.add_argument('--verbose', action='store_true')
+    parser.add_argument('--mmap', action='store_true', help='Enable mmap-based hashing for large files (overrides MMAP_ENABLED)')
+    args = parser.parse_args()
+
+    level = logging.DEBUG if args.verbose else getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), logging.INFO)
+    handler = logging.StreamHandler()
+    handler.setLevel(level)
+    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", "%Y-%m-%d %H:%M:%S"))
+    logger.addHandler(handler)
+    logger.setLevel(level)
+    logger.propagate = False
+
+    # Set the mmap option at runtime (the CLI flag takes precedence over the environment variable)
+    if args.mmap:
+        MMAP_ENABLED = True
+
+    musicbrainzngs.set_useragent(MB_APP_NAME, f"{MB_APP_VERSION} (https://example.com)", MB_APP_CONTACT)
+    AudioOrganizer(args.source, args.target, args.duplicates_log, workers=args.workers).organize()
diff --git a/Filesystem/File Organisation/requirements.txt b/Filesystem/File Organisation/requirements.txt
new file mode 100644
index 0000000..88adf76
--- /dev/null
+++ b/Filesystem/File Organisation/requirements.txt
@@ -0,0 +1,6 @@
+mutagen
+musicbrainzngs
+chromaprint
+requests
+tqdm
+urllib3
diff --git a/typechecker b/Filesystem/typechecker
similarity index 100%
rename from typechecker
rename to Filesystem/typechecker
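
A minimal driver sketch for trying the new organizer outside its command-line entry point. It is illustrative only: the source and target directories and the worker count below are placeholder assumptions, the packages from requirements.txt are assumed to be installed, ACOUSTID_API_KEY is expected in the environment, and importlib is used only because the hyphenated file name prevents a plain import.

# Minimal usage sketch; the directories and worker count are placeholders.
import importlib.util
import logging

import musicbrainzngs

# Load the script as a module; "audio-organizer.py" cannot be imported directly.
spec = importlib.util.spec_from_file_location(
    "audio_organizer", "Filesystem/File Organisation/audio-organizer.py")
audio_organizer = importlib.util.module_from_spec(spec)
spec.loader.exec_module(audio_organizer)

# Root handler so the module's 'audio_organizer' logger output is visible.
logging.basicConfig(level=logging.INFO)
musicbrainzngs.set_useragent(
    audio_organizer.MB_APP_NAME, audio_organizer.MB_APP_VERSION, audio_organizer.MB_APP_CONTACT)

organizer = audio_organizer.AudioOrganizer(
    "/music/incoming",   # source directory (placeholder)
    "/music/library",    # target directory (placeholder)
    "duplicates.txt",    # duplicates log file
    workers=4,
)
organizer.organize()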