#!/usr/bin/env python3
"""
Audio Organizer with Free Spectral Fingerprint API Integration

A script for analysing, categorising and preparing audio files. It combines
local metadata extraction, the free AcoustID/Chromaprint API for spectral
fingerprint lookups, a MusicBrainz fallback, duplicate detection and a
resilient HTTP client with retries.

Optional: mmap-based hashing is only used when the environment variable
MMAP_ENABLED=1 is set (or the --mmap command line flag is given).
"""
import argparse
import hashlib
import logging
import mmap
import os
import re
import shutil
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
from pathlib import Path

import chromaprint
import musicbrainzngs
import mutagen
import requests
from requests.adapters import HTTPAdapter
from tqdm import tqdm
from urllib3.util.retry import Retry

logger = logging.getLogger('audio_organizer')

MB_APP_NAME = os.getenv('MB_APP_NAME', 'AudioOrganizerCorp')
MB_APP_VERSION = os.getenv('MB_APP_VERSION', '2.1')
MB_APP_CONTACT = os.getenv('MB_APP_CONTACT', 'contact@example.com')
ACOUSTID_LOOKUP_URL = 'https://api.acoustid.org/v2/lookup'
ACOUSTID_API_KEY = os.getenv('ACOUSTID_API_KEY', '')
HASH_ALGO = os.getenv('HASH_ALGO', 'md5').lower()
BLOCK_SIZE = int(os.getenv('HASH_BLOCK_SIZE', '65536'))
RATE_LIMIT_DELAY = float(os.getenv('RATE_LIMIT_DELAY', '1.0'))
MMAP_THRESHOLD = int(os.getenv('MMAP_THRESHOLD', str(100 * 1024 * 1024)))  # 100MB
MMAP_ENABLED = os.getenv('MMAP_ENABLED', '0') == '1'

# Read size used when HASH_BLOCK_SIZE is configured as zero or negative.
_DEFAULT_BLOCK_SIZE = 65536

# Characters that must never reach a path component built from metadata:
# both path separators and NUL.
_UNSAFE_PATH_CHARS = re.compile(r'[/\\\x00]')


def _first(values):
    """Return the first element of *values*, or None when it is empty/None."""
    return values[0] if values else None


def _safe_folder_name(name):
    """Turn a metadata string into a single, safe path component.

    Path separators and NUL are replaced with ``_`` so the value can neither
    create nested directories nor replace the base path (an absolute
    component would do so when joined with ``Path``). Names consisting only
    of dots/spaces (``.``, ``..``, blank) yield an empty string so the caller
    can fall back to a default folder.
    """
    if not name:
        return ''
    cleaned = _UNSAFE_PATH_CHARS.sub('_', name)
    if not cleaned.strip('. '):
        return ''
    return cleaned


@lru_cache(maxsize=128)
def _search_musicbrainz(query):
    """Search MusicBrainz for *query* and return ``(album, artist, title)``.

    Kept at module level so the cache does not hold a reference to an
    ``AudioOrganizer`` instance. Errors propagate to the caller; because
    ``lru_cache`` does not store raised exceptions, a transient failure is
    retried on the next call instead of being cached as "no result".
    """
    res = musicbrainzngs.search_recordings(query=query, limit=1)
    rec = _first(res.get('recording-list'))
    if not rec:
        return (None, None, None)
    release = _first(rec.get('release-list')) or {}
    return (release.get('title'), rec.get('artist-credit-phrase'), rec.get('title'))


class AudioOrganizer:
    """Move audio files from a source tree into ``target/<letter>/<album>/``.

    Files are hashed to detect duplicates (duplicates are left in place and
    listed in the duplicates log). The album is taken from local tags, then
    from an AcoustID lookup (when ACOUSTID_API_KEY is set), then from a
    MusicBrainz search on the file name.
    """

    def __init__(self, source_dir, target_dir, duplicates_log, workers=None):
        """Validate the directories and set up the shared HTTP session.

        Raises:
            ValueError: the source directory is missing or not readable.
            PermissionError: the target directory is not writable.
        """
        self.source = Path(source_dir)
        self.target = Path(target_dir)
        self.duplicates_log = Path(duplicates_log)
        self._validate_directories()
        self.supported_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a'}
        self.seen_hashes = {}
        self.duplicates = []
        # Guards seen_hashes / duplicates.
        self.lock = threading.Lock()
        # Serialises "pick a free destination name" + "move", so two workers
        # cannot choose the same destination and overwrite each other.
        self._move_lock = threading.Lock()
        self.workers = workers or os.cpu_count()
        self.session = requests.Session()
        retry = Retry(total=3, backoff_factor=0.3,
                      status_forcelist=[500, 502, 503, 504],
                      allowed_methods=["GET", "POST"])
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)
        if not ACOUSTID_API_KEY:
            logger.warning("ACOUSTID_API_KEY nicht gesetzt; HTTP-Lookup wird übersprungen.")

    def _validate_directories(self):
        """Check the source is readable and create/check the target directory."""
        if not self.source.is_dir() or not os.access(self.source, os.R_OK):
            raise ValueError(f"Quellverzeichnis nicht zugänglich: {self.source}")
        self.target.mkdir(parents=True, exist_ok=True)
        if not self.target.is_dir() or not os.access(self.target, os.W_OK):
            raise PermissionError(f"Zielverzeichnis nicht beschreibbar: {self.target}")

    def compute_hash(self, filepath):
        """Return the hex digest of the file content.

        Uses blake2b when HASH_ALGO is ``blake2b`` and md5 otherwise. Files of
        at least MMAP_THRESHOLD bytes are hashed through mmap when
        MMAP_ENABLED is true; everything else is read in BLOCK_SIZE chunks.
        """
        hasher = hashlib.blake2b() if HASH_ALGO == 'blake2b' else hashlib.md5()
        size = filepath.stat().st_size
        # mmap cannot map an empty file, so those always take the read path.
        use_mmap = MMAP_ENABLED and size > 0 and size >= MMAP_THRESHOLD
        # A non-positive block size would make read() return nothing (0) or
        # the whole file at once (negative); fall back to the default.
        block_size = BLOCK_SIZE if BLOCK_SIZE > 0 else _DEFAULT_BLOCK_SIZE
        with filepath.open('rb') as f:
            if use_mmap:
                with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                    hasher.update(mm)
            else:
                for chunk in iter(lambda: f.read(block_size), b""):
                    hasher.update(chunk)
        return hasher.hexdigest()

    def process_file(self, path):
        """Hash, classify and move a single file.

        Unsupported extensions are ignored. A file whose hash was already
        seen is recorded in ``self.duplicates`` and left where it is. Any
        exception is logged and swallowed so one bad file does not stop the
        whole run.
        """
        try:
            if path.suffix.lower() not in self.supported_exts:
                return
            file_hash = self.compute_hash(path)
            with self.lock:
                if file_hash in self.seen_hashes:
                    self.duplicates.append(path)
                    return
                self.seen_hashes[file_hash] = path
            album, _artist, _title = self.extract_metadata(path)
            with self._move_lock:
                dest = self._prepare_destination(path, album)
                shutil.move(str(path), str(dest))
        except Exception as e:
            logger.error(f"Fehler bei Datei {path}: {e}")

    def extract_metadata(self, filepath):
        """Return ``(album, artist, title)`` for *filepath*.

        Order of sources: local tags (only when an album tag is present),
        AcoustID lookup (only when ACOUSTID_API_KEY is set), MusicBrainz
        search on the file stem. Any element may be None.
        """
        audio = mutagen.File(filepath, easy=True)
        if audio and audio.tags:
            tag_map = audio.tags
            album_values = tag_map.get('album')
            if album_values:
                return (album_values[0],
                        _first(tag_map.get('artist')),
                        _first(tag_map.get('title')))
        if ACOUSTID_API_KEY:
            res = self.acoustid_lookup_http(filepath)
            if res:
                # The AcoustID lookup yields (title, artist, album); callers
                # of this method expect (album, artist, title).
                title, artist, album = res
                return album, artist, title
        return self.fetch_metadata_online(filepath.stem)

    def acoustid_lookup_http(self, filepath):
        """Look *filepath* up at AcoustID by its audio fingerprint.

        Retries with exponential backoff (capped at 20 s, at most 5 attempts)
        on HTTP 429 and on timeouts; other network or unexpected errors abort
        immediately.

        Returns:
            ``(title, artist, album)`` for the first recording of the first
            result, or None when nothing usable was found or the lookup
            failed.
        """
        max_delay = 20.0
        max_retries = 5

        # Fingerprint once, outside the retry loop: it only depends on the
        # file and is expensive to recompute.
        # NOTE(review): pyacoustid exposes fingerprint_file on its `acoustid`
        # module; confirm the installed `chromaprint` module provides it.
        try:
            duration, fp = chromaprint.fingerprint_file(str(filepath))
        except chromaprint.FingerprintError as e:
            logger.error(f"Chromaprint-Error für {filepath.name}: {e}")
            return None
        except Exception as e:
            logger.error(f"Unerwarteter Fehler bei AcoustID-Lookup: {e} ({filepath.name})")
            return None

        delay = RATE_LIMIT_DELAY
        for _attempt in range(max_retries):
            # Also sleeps before the first request, as basic rate limiting.
            time.sleep(delay)
            try:
                resp = self.session.get(
                    ACOUSTID_LOOKUP_URL,
                    params={'client': ACOUSTID_API_KEY, 'duration': duration,
                            'fingerprint': fp, 'meta': 'recordings'},
                    timeout=15,
                )
                if resp.status_code == 429:
                    delay = min(max_delay, delay * 2)
                    logger.warning(f"Rate-Limit von AcoustID API erreicht (HTTP 429), Backoff erhöht auf {delay:.1f}s.")
                    continue
                resp.raise_for_status()
                return self._parse_acoustid_response(resp.json(), filepath)
            except requests.Timeout:
                delay = min(max_delay, delay * 2)
                logger.warning(f"Timeout bei AcoustID-Anfrage ({filepath.name}), nächster Versuch in {delay:.1f}s.")
            except requests.RequestException as e:
                logger.error(f"Netzwerkfehler bei AcoustID: {e} ({filepath.name})")
                return None
            except Exception as e:
                logger.error(f"Unerwarteter Fehler bei AcoustID-Lookup: {e} ({filepath.name})")
                return None

        # Only reached when every attempt ended in a 429 or a timeout.
        logger.error(f"Maximale Wiederholungsversuche für AcoustID API bei {filepath.name} erreicht – übersprungen.")
        return None

    def _parse_acoustid_response(self, json_data, filepath):
        """Extract ``(title, artist, album)`` from a decoded AcoustID response."""
        results = json_data.get('results') if isinstance(json_data, dict) else None
        if not results:
            logger.warning(f"Leere oder ungültige AcoustID-Antwort für {filepath.name}: {json_data}")
            return None
        recs = results[0].get('recordings')
        if not recs:
            return None
        rec = recs[0]
        # 'artists' may be missing or an empty list.
        first_artist = _first(rec.get('artists')) or {}
        album = self._fetch_album_from_mbid(rec.get('id'))
        return rec.get('title'), first_artist.get('name'), album

    def _fetch_album_from_mbid(self, mbid):
        """Return the title of the first release of recording *mbid*, or None.

        Best effort: any lookup failure results in None.
        """
        if not mbid:
            return None
        try:
            rec = musicbrainzngs.get_recording_by_id(mbid, includes=['releases'])
            rels = rec['recording'].get('release-list', [])
            return rels[0]['title'] if rels else None
        except Exception as e:
            logger.debug(f"MusicBrainz release lookup failed for {mbid}: {e}")
            return None

    def fetch_metadata_online(self, stem):
        """Search MusicBrainz by file stem and return ``(album, artist, title)``.

        Underscores in *stem* are treated as spaces. Successful lookups are
        cached per query; on failure the error is logged and
        ``(None, None, None)`` is returned.
        """
        try:
            return _search_musicbrainz(stem.replace('_', ' '))
        except Exception as e:
            logger.error(f"MusicBrainz fallback failed: {e}")
            return (None, None, None)

    def _prepare_destination(self, path, album):
        """Create the album folder and return a free destination path in it.

        The folder is ``target/<first letter>/<album>``; a missing or
        unusable album name maps to ``target/_/Unknown Album``. When the file
        name is already taken, ``_1``, ``_2``, ... is appended to the stem.
        """
        album_name = _safe_folder_name(album)
        letter = album_name[0].upper() if album_name and album_name[0].isalpha() else '_'
        folder = self.target / letter / (album_name or 'Unknown Album')
        folder.mkdir(parents=True, exist_ok=True)
        dest = folder / path.name
        i = 1
        while dest.exists():
            dest = folder / f"{path.stem}_{i}{path.suffix}"
            i += 1
        return dest

    def organize(self):
        """Process every file below the source directory in a thread pool.

        On Ctrl-C, tasks that have not started yet are cancelled; tasks
        already running are allowed to finish. The duplicates log is written
        (when duplicates were found) and the HTTP session is closed in all
        cases.
        """
        # Walk the tree once so the progress total matches the submitted work.
        paths = [Path(root) / name
                 for root, _, files in os.walk(self.source)
                 for name in files]
        try:
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                futures = [executor.submit(self.process_file, p) for p in paths]
                try:
                    for future in tqdm(futures, total=len(futures), desc="Verarbeite Audio-Dateien"):
                        future.result()
                except KeyboardInterrupt:
                    logger.warning("Abbruch durch Benutzer, beende...")
                    # Without this the executor would still work through the
                    # entire queue before the `with` block exits.
                    for future in futures:
                        future.cancel()
        finally:
            if self.duplicates:
                self.duplicates_log.parent.mkdir(parents=True, exist_ok=True)
                self.duplicates_log.write_text("\n".join(str(p) for p in self.duplicates), encoding='utf-8')
            self.session.close()


def main():
    """Parse the command line, configure logging and run the organizer."""
    global MMAP_ENABLED

    parser = argparse.ArgumentParser()
    parser.add_argument('source')
    parser.add_argument('target')
    parser.add_argument('--duplicates-log', default='duplicates.txt')
    cpu_max = os.cpu_count() or 1
    parser.add_argument('--workers', type=int, choices=range(1, cpu_max + 1), metavar=f'[1-{cpu_max}]')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--mmap', action='store_true',
                        help='Aktiviere mmap-basiertes Hashing für große Dateien (überschreibt MMAP_ENABLED)')
    args = parser.parse_args()

    level = logging.DEBUG if args.verbose else getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), logging.INFO)
    handler = logging.StreamHandler()
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.setLevel(level)
    logger.propagate = False

    # Set the mmap option dynamically (CLI takes precedence over the
    # environment variable).
    if args.mmap:
        MMAP_ENABLED = True

    musicbrainzngs.set_useragent(MB_APP_NAME, f"{MB_APP_VERSION} (https://example.com)", MB_APP_CONTACT)
    try:
        organizer = AudioOrganizer(args.source, args.target, args.duplicates_log, workers=args.workers)
    except (ValueError, PermissionError) as exc:
        # Report unusable directories as a usage error instead of a traceback.
        parser.error(str(exc))
    organizer.organize()


if __name__ == '__main__':
    main()