#!/usr/bin/env python3
"""
Audio Organizer with Free Spectral Fingerprint API Integration

A robust, future-proof script for analysing, categorising, and preparing audio files.
It combines local metadata extraction, the free AcoustID/Chromaprint API for spectral
analysis, a MusicBrainz fallback, duplicate detection, and a resilient HTTP client with retries.
Optional: mmap-based hashing, used only when the environment variable MMAP_ENABLED=1
is set (or the --mmap flag is passed).
"""
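
# Example invocation (script name and paths are illustrative; a free AcoustID API key is assumed):
#   ACOUSTID_API_KEY=your_key ./audio_organizer.py ~/Music/incoming ~/Music/library \
#       --duplicates-log logs/duplicates.txt --workers 4 --verbose --mmap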
import os
import time
import hashlib
import logging
import argparse
import requests
import shutil
from pathlib import Path
import mutagen
import musicbrainzngs
import acoustid  # pyacoustid: Chromaprint fingerprinting helpers (fingerprint_file)
from concurrent.futures import ThreadPoolExecutor
import threading
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import mmap
from functools import lru_cache

logger = logging.getLogger('audio_organizer')

MB_APP_NAME = os.getenv('MB_APP_NAME', 'AudioOrganizerCorp')
MB_APP_VERSION = os.getenv('MB_APP_VERSION', '2.1')
MB_APP_CONTACT = os.getenv('MB_APP_CONTACT', 'contact@example.com')
ACOUSTID_LOOKUP_URL = 'https://api.acoustid.org/v2/lookup'
ACOUSTID_API_KEY = os.getenv('ACOUSTID_API_KEY', '')
HASH_ALGO = os.getenv('HASH_ALGO', 'md5').lower()
BLOCK_SIZE = int(os.getenv('HASH_BLOCK_SIZE', '65536'))
RATE_LIMIT_DELAY = float(os.getenv('RATE_LIMIT_DELAY', '1.0'))
MMAP_THRESHOLD = int(os.getenv('MMAP_THRESHOLD', str(100 * 1024 * 1024)))  # 100 MB
MMAP_ENABLED = os.getenv('MMAP_ENABLED', '0') == '1'

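
# AudioOrganizer walks the source tree, hashes every supported audio file to detect
# duplicates, resolves album metadata (local tags -> AcoustID lookup -> MusicBrainz
# search), and moves each file into <target>/<first letter>/<album>/.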
class AudioOrganizer:
    def __init__(self, source_dir, target_dir, duplicates_log, workers=None):
        self.source = Path(source_dir)
        self.target = Path(target_dir)
        self.duplicates_log = Path(duplicates_log)
        self._validate_directories()
        self.supported_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a'}
        self.seen_hashes = {}
        self.duplicates = []
        self.lock = threading.Lock()
        self.workers = workers or os.cpu_count()
        self.session = requests.Session()
        retry = Retry(total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504], allowed_methods=["GET", "POST"])
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)
        if not ACOUSTID_API_KEY:
            logger.warning("ACOUSTID_API_KEY not set; HTTP lookup will be skipped.")

    def _validate_directories(self):
        if not self.source.is_dir() or not os.access(self.source, os.R_OK):
            raise ValueError(f"Source directory not accessible: {self.source}")
        self.target.mkdir(parents=True, exist_ok=True)
        if not self.target.is_dir() or not os.access(self.target, os.W_OK):
            raise PermissionError(f"Target directory not writable: {self.target}")

    def compute_hash(self, filepath):
        hasher = hashlib.blake2b() if HASH_ALGO == 'blake2b' else hashlib.md5()
        size = filepath.stat().st_size
        use_mmap = MMAP_ENABLED and size >= MMAP_THRESHOLD
        with filepath.open('rb') as f:
            if use_mmap:
                mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
                hasher.update(mm)
                mm.close()
            else:
                for chunk in iter(lambda: f.read(BLOCK_SIZE), b""):
                    hasher.update(chunk)
        return hasher.hexdigest()

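    # process_file() runs on worker threads; access to the shared hash map is guarded
    # by self.lock so duplicate detection stays race-free, while metadata lookups and
    # the final move happen outside the lock.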
    def process_file(self, path):
        try:
            if path.suffix.lower() not in self.supported_exts:
                return
            file_hash = self.compute_hash(path)
            with self.lock:
                if file_hash in self.seen_hashes:
                    self.duplicates.append(path)
                    return
                self.seen_hashes[file_hash] = path
            album, artist, title = self.extract_metadata(path)
            dest = self._prepare_destination(path, album)
            shutil.move(str(path), str(dest))
        except Exception as e:
            logger.error(f"Error processing file {path}: {e}")

    def extract_metadata(self, filepath):
        tags = mutagen.File(filepath, easy=True)
        if tags and tags.tags and tags.tags.get('album'):
            return tags.tags['album'][0], tags.tags.get('artist', [None])[0], tags.tags.get('title', [None])[0]
        if ACOUSTID_API_KEY:
            res = self.acoustid_lookup_http(filepath)
            if res:
                return res
        return self.fetch_metadata_online(filepath.stem)

    def acoustid_lookup_http(self, filepath):
        # Adaptive backoff plus extended error/timeout handling
        min_delay = RATE_LIMIT_DELAY
        max_delay = 20.0
        delay = min_delay
        max_retries = 5
        tries = 0

        while tries < max_retries:
            time.sleep(delay)
            try:
                try:
                    # pyacoustid returns (duration in seconds, compressed fingerprint)
                    duration, fp = acoustid.fingerprint_file(str(filepath))
                except acoustid.FingerprintGenerationError as e:
                    logger.error(f"Fingerprint generation failed for {filepath.name}: {e}")
                    return None
                resp = self.session.get(
                    ACOUSTID_LOOKUP_URL,
                    params={'client': ACOUSTID_API_KEY, 'duration': duration, 'fingerprint': fp, 'meta': 'recordings'},
                    timeout=15
                )
                if resp.status_code == 429:
                    logger.warning(f"AcoustID API rate limit hit (HTTP 429); backing off to {min(max_delay, delay * 2):.1f}s.")
                    delay = min(max_delay, delay * 2)
                    tries += 1
                    continue
                resp.raise_for_status()
                json_data = resp.json()
                if 'results' not in json_data or not json_data['results']:
                    logger.warning(f"Empty or invalid AcoustID response for {filepath.name}: {json_data}")
                    return None
                recs = json_data['results'][0].get('recordings')
                if not recs:
                    return None
                rec = recs[0]
                album = self._fetch_album_from_mbid(rec.get('id'))
                # Return in (album, artist, title) order to match extract_metadata()
                return album, rec.get('artists', [{}])[0].get('name'), rec.get('title')
            except requests.Timeout:
                logger.warning(f"Timeout during AcoustID request ({filepath.name}); retrying in {min(max_delay, delay * 2):.1f}s.")
                delay = min(max_delay, delay * 2)
                tries += 1
            except requests.RequestException as e:
                logger.error(f"Network error during AcoustID lookup: {e} ({filepath.name})")
                break
            except Exception as e:
                logger.error(f"Unexpected error during AcoustID lookup: {e} ({filepath.name})")
                break

        logger.error(f"Maximum retries for the AcoustID API reached for {filepath.name}; skipping.")
        return None

    def _fetch_album_from_mbid(self, mbid):
        if not mbid:
            return None
        try:
            rec = musicbrainzngs.get_recording_by_id(mbid, includes=['releases'])
            rels = rec['recording'].get('release-list', [])
            return rels[0]['title'] if rels else None
        except Exception:
            return None

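    # Note: lru_cache on an instance method caches per (self, stem) and keeps the
    # instance alive for the lifetime of the cache; acceptable here because a single
    # AudioOrganizer is created per run.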
    @lru_cache(maxsize=128)
    def fetch_metadata_online(self, stem):
        try:
            res = musicbrainzngs.search_recordings(query=stem.replace('_', ' '), limit=1)
            rec = res.get('recording-list', [None])[0]
            if not rec:
                return (None, None, None)
            return (rec['release-list'][0]['title'], rec['artist-credit-phrase'], rec['title'])
        except Exception as e:
            logger.error(f"MusicBrainz fallback failed: {e}")
            return (None, None, None)

    def _prepare_destination(self, path, album):
        letter = album[0].upper() if album and album[0].isalpha() else '_'
        folder = self.target / letter / (album or 'Unknown Album')
        folder.mkdir(parents=True, exist_ok=True)
        dest = folder / path.name
        i = 1
        while dest.exists():
            base, ext = path.stem, path.suffix
            dest = folder / f"{base}_{i}{ext}"
            i += 1
        return dest

    def organize(self):
        total = sum(1 for _, _, files in os.walk(self.source) for _ in files)
        try:
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                paths = (Path(r) / f for r, _, files in os.walk(self.source) for f in files)
                for _ in tqdm(executor.map(self.process_file, paths), total=total, desc="Processing audio files"):
                    pass
        except KeyboardInterrupt:
            logger.warning("Interrupted by user, shutting down...")
        finally:
            if self.duplicates:
                self.duplicates_log.parent.mkdir(parents=True, exist_ok=True)
                self.duplicates_log.write_text("\n".join(str(p) for p in self.duplicates), encoding='utf-8')
            self.session.close()

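# CLI entry point: --verbose takes precedence over LOG_LEVEL, --mmap over MMAP_ENABLED,
# and --workers is limited to the CPU count reported by os.cpu_count().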
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('source')
    parser.add_argument('target')
    parser.add_argument('--duplicates-log', default='duplicates.txt')
    cpu_max = os.cpu_count() or 1
    parser.add_argument('--workers', type=int, choices=range(1, cpu_max + 1), metavar=f'[1-{cpu_max}]')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--mmap', action='store_true', help='Enable mmap-based hashing for large files (overrides MMAP_ENABLED)')
    args = parser.parse_args()

    level = logging.DEBUG if args.verbose else getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), logging.INFO)
    handler = logging.StreamHandler()
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.setLevel(level)
    logger.propagate = False

    # Set the mmap option dynamically (the CLI flag takes precedence over the environment variable)
    if args.mmap:
        MMAP_ENABLED = True

    musicbrainzngs.set_useragent(MB_APP_NAME, f"{MB_APP_VERSION} (https://example.com)", MB_APP_CONTACT)
    AudioOrganizer(args.source, args.target, args.duplicates_log, workers=args.workers).organize()