New tool added and better structured repo
parent 0d3441e0e4
commit 3691c50bb9
5 changed files with 239 additions and 0 deletions

Filesystem/File Organisation/audio-organizer.py (executable file, 233 additions)
@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""
Audio Organizer with Free Spectral Fingerprint API Integration

A robust, forward-looking script for analysing, categorising and preparing
audio files. It integrates local metadata extraction, the free AcoustID/Chromaprint
API for spectral analysis, a MusicBrainz fallback, duplicate detection, and a
resilient HTTP client with retries.
Optional: mmap-based hashing, used only when the environment variable MMAP_ENABLED=1 is set.
"""
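
# Usage sketch (illustrative invocation; the directory paths below are hypothetical examples):
#   ACOUSTID_API_KEY=<your-key> python3 audio-organizer.py /music/incoming /music/library \
#       --duplicates-log duplicates.txt --workers 4 --verbose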

import os
import time
import hashlib
import logging
import argparse
import requests
import shutil
from pathlib import Path
import mutagen
import musicbrainzngs
import chromaprint
from concurrent.futures import ThreadPoolExecutor
import threading
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import mmap
from functools import lru_cache

logger = logging.getLogger('audio_organizer')

MB_APP_NAME = os.getenv('MB_APP_NAME', 'AudioOrganizerCorp')
MB_APP_VERSION = os.getenv('MB_APP_VERSION', '2.1')
MB_APP_CONTACT = os.getenv('MB_APP_CONTACT', 'contact@example.com')
ACOUSTID_LOOKUP_URL = 'https://api.acoustid.org/v2/lookup'
ACOUSTID_API_KEY = os.getenv('ACOUSTID_API_KEY', '')
HASH_ALGO = os.getenv('HASH_ALGO', 'md5').lower()
BLOCK_SIZE = int(os.getenv('HASH_BLOCK_SIZE', '65536'))
RATE_LIMIT_DELAY = float(os.getenv('RATE_LIMIT_DELAY', '1.0'))
MMAP_THRESHOLD = int(os.getenv('MMAP_THRESHOLD', str(100 * 1024 * 1024)))  # 100 MB
MMAP_ENABLED = os.getenv('MMAP_ENABLED', '0') == '1'
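
# Example environment overrides (illustrative values only; every setting above has a default):
#   export ACOUSTID_API_KEY=<your-acoustid-key>   # enables the AcoustID HTTP lookup
#   export HASH_ALGO=blake2b                      # switch duplicate hashing from md5 to blake2b
#   export MMAP_ENABLED=1                         # opt in to mmap-based hashing for large files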

class AudioOrganizer:
    def __init__(self, source_dir, target_dir, duplicates_log, workers=None):
        self.source = Path(source_dir)
        self.target = Path(target_dir)
        self.duplicates_log = Path(duplicates_log)
        self._validate_directories()
        self.supported_exts = {'.mp3', '.flac', '.wav', '.aac', '.ogg', '.m4a'}
        self.seen_hashes = {}
        self.duplicates = []
        self.lock = threading.Lock()
        self.workers = workers or os.cpu_count()
        # Shared HTTP session with automatic retries for transient server errors.
        self.session = requests.Session()
        retry = Retry(total=3, backoff_factor=0.3, status_forcelist=[500, 502, 503, 504],
                      allowed_methods=["GET", "POST"])
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.session.mount('http://', adapter)
        if not ACOUSTID_API_KEY:
            logger.warning("ACOUSTID_API_KEY is not set; HTTP lookup will be skipped.")

    def _validate_directories(self):
        if not self.source.is_dir() or not os.access(self.source, os.R_OK):
            raise ValueError(f"Source directory is not accessible: {self.source}")
        self.target.mkdir(parents=True, exist_ok=True)
        if not self.target.is_dir() or not os.access(self.target, os.W_OK):
            raise PermissionError(f"Target directory is not writable: {self.target}")

    def compute_hash(self, filepath):
        # Hash algorithm is selectable via HASH_ALGO; md5 is the default.
        hasher = hashlib.blake2b() if HASH_ALGO == 'blake2b' else hashlib.md5()
        size = filepath.stat().st_size
        use_mmap = MMAP_ENABLED and size >= MMAP_THRESHOLD
        with filepath.open('rb') as f:
            if use_mmap:
                # Map large files into memory instead of reading them block by block.
                mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
                hasher.update(mm)
                mm.close()
            else:
                for chunk in iter(lambda: f.read(BLOCK_SIZE), b""):
                    hasher.update(chunk)
        return hasher.hexdigest()

    def process_file(self, path):
        try:
            if path.suffix.lower() not in self.supported_exts:
                return
            file_hash = self.compute_hash(path)
            with self.lock:
                if file_hash in self.seen_hashes:
                    self.duplicates.append(path)
                    return
                self.seen_hashes[file_hash] = path
            album, artist, title = self.extract_metadata(path)
            dest = self._prepare_destination(path, album)
            shutil.move(str(path), str(dest))
        except Exception as e:
            logger.error(f"Error processing file {path}: {e}")

    def extract_metadata(self, filepath):
        tags = mutagen.File(filepath, easy=True)
        if tags and tags.tags and tags.tags.get('album'):
            return tags.tags['album'][0], tags.tags.get('artist', [None])[0], tags.tags.get('title', [None])[0]
        if ACOUSTID_API_KEY:
            res = self.acoustid_lookup_http(filepath)
            if res:
                return res
        return self.fetch_metadata_online(filepath.stem)

    def acoustid_lookup_http(self, filepath):
        # Adaptive backoff with extended error and timeout handling.
        min_delay = RATE_LIMIT_DELAY
        max_delay = 20.0
        delay = min_delay
        max_retries = 5
        tries = 0

        while tries < max_retries:
            time.sleep(delay)
            try:
                try:
                    duration, fp = chromaprint.fingerprint_file(str(filepath))
                except chromaprint.FingerprintError as e:
                    logger.error(f"Chromaprint error for {filepath.name}: {e}")
                    return None
                resp = self.session.get(
                    ACOUSTID_LOOKUP_URL,
                    params={'client': ACOUSTID_API_KEY, 'duration': duration, 'fingerprint': fp, 'meta': 'recordings'},
                    timeout=15
                )
                if resp.status_code == 429:
                    logger.warning(f"AcoustID API rate limit reached (HTTP 429), backoff increased to {min(max_delay, delay * 2):.1f}s.")
                    delay = min(max_delay, delay * 2)
                    tries += 1
                    continue
                resp.raise_for_status()
                json_data = resp.json()
                if 'results' not in json_data or not json_data['results']:
                    logger.warning(f"Empty or invalid AcoustID response for {filepath.name}: {json_data}")
                    return None
                recs = json_data['results'][0].get('recordings')
                if not recs:
                    return None
                rec = recs[0]
                album = self._fetch_album_from_mbid(rec.get('id'))
                # Return order matches the (album, artist, title) tuple expected by process_file.
                return album, rec.get('artists', [{}])[0].get('name'), rec.get('title')
            except requests.Timeout:
                logger.warning(f"Timeout during AcoustID request ({filepath.name}), next attempt in {min(max_delay, delay * 2):.1f}s.")
                delay = min(max_delay, delay * 2)
                tries += 1
            except requests.RequestException as e:
                logger.error(f"Network error during AcoustID lookup: {e} ({filepath.name})")
                break
            except Exception as e:
                logger.error(f"Unexpected error during AcoustID lookup: {e} ({filepath.name})")
                break

        logger.error(f"Maximum retries for the AcoustID API reached for {filepath.name}; skipping.")
        return None

    def _fetch_album_from_mbid(self, mbid):
        if not mbid:
            return None
        try:
            rec = musicbrainzngs.get_recording_by_id(mbid, includes=['releases'])
            rels = rec['recording'].get('release-list', [])
            return rels[0]['title'] if rels else None
        except Exception:
            return None

    @lru_cache(maxsize=128)
    def fetch_metadata_online(self, stem):
        try:
            res = musicbrainzngs.search_recordings(query=stem.replace('_', ' '), limit=1)
            rec = res.get('recording-list', [None])[0]
            if not rec:
                return (None, None, None)
            return (rec['release-list'][0]['title'], rec['artist-credit-phrase'], rec['title'])
        except Exception as e:
            logger.error(f"MusicBrainz fallback failed: {e}")
            return (None, None, None)

    def _prepare_destination(self, path, album):
        letter = album[0].upper() if album and album[0].isalpha() else '_'
        folder = self.target / letter / (album or 'Unknown Album')
        folder.mkdir(parents=True, exist_ok=True)
        dest = folder / path.name
        i = 1
        while dest.exists():
            base, ext = path.stem, path.suffix
            dest = folder / f"{base}_{i}{ext}"
            i += 1
        return dest

    def organize(self):
        total = sum(1 for _, _, files in os.walk(self.source) for _ in files)
        try:
            with ThreadPoolExecutor(max_workers=self.workers) as executor:
                paths = (Path(r) / f for r, _, files in os.walk(self.source) for f in files)
                for _ in tqdm(executor.map(self.process_file, paths), total=total, desc="Processing audio files"):
                    pass
        except KeyboardInterrupt:
            logger.warning("Interrupted by user, shutting down...")
        finally:
            if self.duplicates:
                self.duplicates_log.parent.mkdir(parents=True, exist_ok=True)
                self.duplicates_log.write_text("\n".join(str(p) for p in self.duplicates), encoding='utf-8')
            self.session.close()
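
# Programmatic use (a minimal sketch; the directories are hypothetical examples):
#   musicbrainzngs.set_useragent(MB_APP_NAME, MB_APP_VERSION, MB_APP_CONTACT)
#   AudioOrganizer('/music/incoming', '/music/library', 'duplicates.txt', workers=4).organize()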

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('source')
    parser.add_argument('target')
    parser.add_argument('--duplicates-log', default='duplicates.txt')
    cpu_max = os.cpu_count() or 1
    parser.add_argument('--workers', type=int, choices=range(1, cpu_max + 1), metavar=f'[1-{cpu_max}]')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--mmap', action='store_true', help='Enable mmap-based hashing for large files (overrides MMAP_ENABLED)')
    args = parser.parse_args()

    level = logging.DEBUG if args.verbose else getattr(logging, os.getenv('LOG_LEVEL', 'INFO').upper(), logging.INFO)
    handler = logging.StreamHandler()
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.setLevel(level)
    logger.propagate = False

    # Set the mmap option dynamically (the CLI flag takes precedence over the environment variable).
    if args.mmap:
        MMAP_ENABLED = True

    musicbrainzngs.set_useragent(MB_APP_NAME, f"{MB_APP_VERSION} (https://example.com)", MB_APP_CONTACT)
    AudioOrganizer(args.source, args.target, args.duplicates_log, workers=args.workers).organize()

Filesystem/File Organisation/requirements.txt (normal file, 6 additions)
@@ -0,0 +1,6 @@
mutagen
musicbrainzngs
chromaprint
requests
tqdm
urllib3