Added a new tool and restructured the repo

This commit is contained in:
0n1cOn3 2025-07-07 04:48:40 +02:00
parent 0d3441e0e4
commit 3691c50bb9
5 changed files with 239 additions and 0 deletions

View file

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""
Audio Organizer with Free Spectral Fingerprint API Integration
A robust, forward-looking script for analyzing, categorizing and preparing audio files.
It combines local metadata extraction, the free AcoustID/Chromaprint API for spectral
fingerprint lookups, a MusicBrainz fallback, duplicate detection and a resilient HTTP
client with retries.
Optional: mmap-based hashing is used only when the environment variable MMAP_ENABLED=1 is set.
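Example usage (a sketch; the script file name audio_organizer.py is assumed here):
    ACOUSTID_API_KEY=yourkey python3 audio_organizer.py /music/incoming /music/library \
        --duplicates-log duplicates.txt --workers 4 --verbose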
"""
import os
import time
import hashlib
import logging
import argparse
import requests
import shutil
from pathlib import Path
import mutagen
import musicbrainzngs
import acoustid  # pyacoustid: Chromaprint bindings plus the fingerprint_file() helper
from concurrent.futures import ThreadPoolExecutor
import threading
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import mmap
from functools import lru_cache
logger = logging.getLogger('audio_organizer')
MB_APP_NAME = os.getenv('MB_APP_NAME', 'AudioOrganizerCorp')
MB_APP_VERSION = os.getenv('MB_APP_VERSION', '2.1')
MB_APP_CONTACT = os.getenv('MB_APP_CONTACT', 'contact@example.com')
ACOUSTID_LOOKUP_URL = 'https://api.acoustid.org/v2/lookup'
ACOUSTID_API_KEY = os.getenv('ACOUSTID_API_KEY', '')
HASH_ALGO = os.getenv('HASH_ALGO', 'md5').lower()
BLOCK_SIZE = int(os.getenv('HASH_BLOCK_SIZE', '65536'))
RATE_LIMIT_DELAY = float(os.getenv('RATE_LIMIT_DELAY', '1.0'))
MMAP_THRESHOLD = int(os.getenv('MMAP_THRESHOLD', str(100 * 1024 * 1024))) # 100MB
MMAP_ENABLED = os.getenv('MMAP_ENABLED', '0') == '1'
class AudioOrganizer:
def __init__(self, source_dir, target_dir, duplicates_log, workers=None):
self.source = Path(source_dir)
self.target = Path(target_dir)
self.duplicates_log = Path(duplicates_log)
self._validate_directories()
self.supported_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a'}
self.seen_hashes = {}
self.duplicates = []
self.lock = threading.Lock()
self.workers = workers or os.cpu_count()
self.session = requests.Session()
retry = Retry(total=3, backoff_factor=0.3, status_forcelist=[500,502,503,504], allowed_methods=["GET","POST"])
adapter = HTTPAdapter(max_retries=retry)
self.session.mount('https://', adapter)
self.session.mount('http://', adapter)
if not ACOUSTID_API_KEY:
logger.warning("ACOUSTID_API_KEY nicht gesetzt; HTTP-Lookup wird übersprungen.")
def _validate_directories(self):
if not self.source.is_dir() or not os.access(self.source, os.R_OK):
raise ValueError(f"Quellverzeichnis nicht zugänglich: {self.source}")
self.target.mkdir(parents=True, exist_ok=True)
if not self.target.is_dir() or not os.access(self.target, os.W_OK):
raise PermissionError(f"Zielverzeichnis nicht beschreibbar: {self.target}")
def compute_hash(self, filepath):
hasher = hashlib.blake2b() if HASH_ALGO=='blake2b' else hashlib.md5()
size = filepath.stat().st_size
use_mmap = MMAP_ENABLED and size >= MMAP_THRESHOLD
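        # With mmap hashing enabled, large files (>= MMAP_THRESHOLD) are hashed through a
        # read-only memory map in one update; everything else is streamed in BLOCK_SIZE
        # chunks to keep memory usage bounded.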
with filepath.open('rb') as f:
if use_mmap:
mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
hasher.update(mm)
mm.close()
else:
for chunk in iter(lambda: f.read(BLOCK_SIZE), b""):
hasher.update(chunk)
return hasher.hexdigest()
def process_file(self, path):
try:
if path.suffix.lower() not in self.supported_exts:
return
file_hash = self.compute_hash(path)
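            # Deduplicate by content hash; seen_hashes and duplicates are shared across
            # worker threads, so they are only touched while holding the lock.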
with self.lock:
if file_hash in self.seen_hashes:
self.duplicates.append(path)
return
self.seen_hashes[file_hash] = path
album, artist, title = self.extract_metadata(path)
dest = self._prepare_destination(path, album)
shutil.move(str(path), str(dest))
except Exception as e:
logger.error(f"Fehler bei Datei {path}: {e}")
def extract_metadata(self, filepath):
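        # Resolution order: embedded tags first, then an AcoustID fingerprint lookup
        # (only if an API key is configured), finally a MusicBrainz search on the file name.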
tags = mutagen.File(filepath, easy=True)
if tags and tags.tags and tags.tags.get('album'):
return tags.tags['album'][0], tags.tags.get('artist',[None])[0], tags.tags.get('title',[None])[0]
if ACOUSTID_API_KEY:
res = self.acoustid_lookup_http(filepath)
if res:
return res
return self.fetch_metadata_online(filepath.stem)
def acoustid_lookup_http(self, filepath):
        # Adaptive backoff plus extended error and timeout handling
min_delay = RATE_LIMIT_DELAY
max_delay = 20.0
delay = min_delay
max_retries = 5
tries = 0
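        # On HTTP 429 or a timeout the delay doubles (capped at max_delay); other request
        # errors or unexpected exceptions abort the loop immediately.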
while tries < max_retries:
time.sleep(delay)
try:
                try:
                    # pyacoustid (imported as `acoustid`) generates the fingerprint via
                    # Chromaprint/fpcalc and returns (duration, fingerprint).
                    duration, fp = acoustid.fingerprint_file(str(filepath))
                except (acoustid.FingerprintGenerationError, acoustid.NoBackendError) as e:
                    logger.error(f"Chromaprint error for {filepath.name}: {e}")
                    return None
resp = self.session.get(
ACOUSTID_LOOKUP_URL,
                    params={'client': ACOUSTID_API_KEY, 'duration': int(duration), 'fingerprint': fp, 'meta': 'recordings'},
timeout=15
)
if resp.status_code == 429:
logger.warning(f"Rate-Limit von AcoustID API erreicht (HTTP 429), Backoff erhöht auf {min(max_delay, delay * 2):.1f}s.")
delay = min(max_delay, delay * 2)
tries += 1
continue
resp.raise_for_status()
json_data = resp.json()
if 'results' not in json_data or not json_data['results']:
logger.warning(f"Leere oder ungültige AcoustID-Antwort für {filepath.name}: {json_data}")
return None
recs = json_data['results'][0].get('recordings')
if not recs:
return None
rec = recs[0]
album = self._fetch_album_from_mbid(rec.get('id'))
                # Return (album, artist, title) to match the other extract_metadata paths.
                return album, rec.get('artists', [{}])[0].get('name'), rec.get('title')
except requests.Timeout:
logger.warning(f"Timeout bei AcoustID-Anfrage ({filepath.name}), nächster Versuch in {min(max_delay, delay*2):.1f}s.")
delay = min(max_delay, delay * 2)
tries += 1
except requests.RequestException as e:
logger.error(f"Netzwerkfehler bei AcoustID: {e} ({filepath.name})")
break
except Exception as e:
logger.error(f"Unerwarteter Fehler bei AcoustID-Lookup: {e} ({filepath.name})")
break
logger.error(f"Maximale Wiederholungsversuche für AcoustID API bei {filepath.name} erreicht übersprungen.")
return None
def _fetch_album_from_mbid(self, mbid):
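        # Resolve an album title by fetching the releases attached to the recording MBID
        # that the AcoustID lookup returned.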
if not mbid:
return None
try:
rec = musicbrainzngs.get_recording_by_id(mbid, includes=['releases'])
rels = rec['recording'].get('release-list', [])
return rels[0]['title'] if rels else None
        except Exception as e:
            logger.debug(f"Album lookup for MBID {mbid} failed: {e}")
            return None
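    # Note: lru_cache on a bound method also keys the cache on `self` and keeps the
    # instance alive; tolerable here since a single organizer instance runs per process.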
@lru_cache(maxsize=128)
def fetch_metadata_online(self, stem):
try:
res = musicbrainzngs.search_recordings(query=stem.replace('_',' '), limit=1)
            recordings = res.get('recording-list') or []
            if not recordings:
                return (None, None, None)
            rec = recordings[0]
            releases = rec.get('release-list') or []
            album = releases[0]['title'] if releases else None
            return (album, rec.get('artist-credit-phrase'), rec.get('title'))
except Exception as e:
logger.error(f"MusicBrainz fallback failed: {e}")
return (None,None,None)
def _prepare_destination(self, path, album):
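        # Sort into <target>/<initial letter>/<album>, falling back to '_' and
        # 'Unknown Album'; append a numeric suffix if the file name already exists there.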
letter = album[0].upper() if album and album[0].isalpha() else '_'
folder = self.target / letter / (album or 'Unknown Album')
folder.mkdir(parents=True, exist_ok=True)
dest = folder / path.name
i = 1
while dest.exists():
base, ext = path.stem, path.suffix
dest = folder / f"{base}_{i}{ext}"
i += 1
return dest
def organize(self):
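        # Count the files once to size the progress bar, then hash, deduplicate and move
        # them concurrently in a thread pool.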
total = sum(1 for _,_,files in os.walk(self.source) for _ in files)
try:
with ThreadPoolExecutor(max_workers=self.workers) as executor:
paths = (Path(r)/f for r,_,files in os.walk(self.source) for f in files)
for _ in tqdm(executor.map(self.process_file, paths), total=total, desc="Verarbeite Audio-Dateien"):
pass
except KeyboardInterrupt:
logger.warning("Abbruch durch Benutzer, beende...")
finally:
if self.duplicates:
self.duplicates_log.parent.mkdir(parents=True, exist_ok=True)
self.duplicates_log.write_text("\n".join(str(p) for p in self.duplicates), encoding='utf-8')
self.session.close()
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('source')
parser.add_argument('target')
parser.add_argument('--duplicates-log', default='duplicates.txt')
cpu_max = os.cpu_count() or 1
parser.add_argument('--workers', type=int, choices=range(1, cpu_max+1), metavar=f'[1-{cpu_max}]')
parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--mmap', action='store_true', help='Enable mmap-based hashing for large files (overrides MMAP_ENABLED)')
args = parser.parse_args()
level = logging.DEBUG if args.verbose else getattr(logging, os.getenv('LOG_LEVEL','INFO').upper(), logging.INFO)
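    # Attach a dedicated stream handler and disable propagation so organizer log lines
    # are not duplicated through the root logger.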
handler = logging.StreamHandler()
handler.setLevel(level)
handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", "%Y-%m-%d %H:%M:%S"))
logger.addHandler(handler)
logger.setLevel(level)
logger.propagate = False
    # Set the mmap option dynamically (the CLI flag takes precedence over the environment variable)
if args.mmap:
MMAP_ENABLED = True
musicbrainzngs.set_useragent(MB_APP_NAME, f"{MB_APP_VERSION} (https://example.com)", MB_APP_CONTACT)
AudioOrganizer(args.source, args.target, args.duplicates_log, workers=args.workers).organize()

View file

@@ -0,0 +1,6 @@
mutagen
musicbrainzngs
pyacoustid
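# pyacoustid needs the native Chromaprint library or the fpcalc binary available at runtime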
requests
tqdm
urllib3