mirror of
https://github.com/cgzirim/seek-tune.git
synced 2025-12-18 09:24:19 +00:00
- Fix frame calculation with proper sliding window iteration - Change hop size to windowSize/2 for 50% overlap - Return magnitude spectrum instead of complex values - Fix Peak time/frequency calculations using proper frame-based indexing - Add Hz conversion using frequency resolution - Remove incorrect frequency-based time calculations
192 lines
4.8 KiB
Go
192 lines
4.8 KiB
Go
package shazam
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"math/cmplx"
|
|
)
|
|
|
|
// Tuning parameters shared by the spectrogram and peak-extraction code.
const (
	// windowSize is the number of samples in one STFT analysis frame.
	windowSize = 1024

	// hopSize is the step between consecutive frames; half a window
	// gives 50% overlap for better time-frequency resolution.
	hopSize = windowSize / 2

	// windowType selects the analysis window: "hanning" or "hamming".
	windowType = "hanning"

	// dspRatio is the integer factor by which audio is downsampled
	// before the STFT is taken.
	dspRatio = 4

	// maxFreq is the low-pass cutoff in Hz (5kHz) applied before downsampling.
	maxFreq = 5000.0
)
|
|
|
|
func Spectrogram(sample []float64, sampleRate int) ([][]float64, error) {
|
|
filteredSample := LowPassFilter(maxFreq, float64(sampleRate), sample)
|
|
|
|
downsampledSample, err := Downsample(filteredSample, sampleRate, sampleRate/dspRatio)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("couldn't downsample audio sample: %v", err)
|
|
}
|
|
|
|
window := make([]float64, windowSize)
|
|
for i := range window {
|
|
theta := 2 * math.Pi * float64(i) / float64(windowSize-1)
|
|
switch windowType {
|
|
case "hamming":
|
|
window[i] = 0.54 - 0.46*math.Cos(theta)
|
|
default: // Hanning window
|
|
window[i] = 0.5 - 0.5*math.Cos(theta)
|
|
}
|
|
}
|
|
|
|
// Initialize spectrogram slice
|
|
spectrogram := make([][]float64, 0)
|
|
|
|
// Perform STFT
|
|
for start := 0; start+windowSize <= len(downsampledSample); start += hopSize {
|
|
end := start + windowSize
|
|
|
|
frame := make([]float64, windowSize)
|
|
copy(frame, downsampledSample[start:end])
|
|
|
|
// Apply window
|
|
for j := range window {
|
|
frame[j] *= window[j]
|
|
}
|
|
|
|
// Perform FFT
|
|
fftResult := FFT(frame)
|
|
|
|
// Convert complex spectrum to magnitude spectrum
|
|
magnitude := make([]float64, len(fftResult)/2)
|
|
for j := range magnitude {
|
|
magnitude[j] = cmplx.Abs(fftResult[j])
|
|
}
|
|
|
|
spectrogram = append(spectrogram, magnitude)
|
|
}
|
|
|
|
return spectrogram, nil
|
|
}
|
|
|
|
// LowPassFilter runs input through a first-order (single-pole) low-pass
// filter that attenuates content above cutoffFrequency.
//
// It is the discrete form of H(s) = 1 / (1 + sRC) with RC = 1/(2*pi*cutoff):
//
//	y[n] = alpha*x[n] + (1-alpha)*y[n-1],  alpha = dt / (RC + dt),  dt = 1/sampleRate
//
// The filter starts from rest, so the first output is alpha*x[0]. The
// returned slice is newly allocated and has the same length as input.
func LowPassFilter(cutoffFrequency, sampleRate float64, input []float64) []float64 {
	timeConstant := 1.0 / (2 * math.Pi * cutoffFrequency)
	samplePeriod := 1.0 / sampleRate
	alpha := samplePeriod / (timeConstant + samplePeriod)

	out := make([]float64, len(input))
	if len(input) == 0 {
		return out
	}

	// First sample: there is no previous output to blend with.
	out[0] = input[0] * alpha

	// Every later sample is blended with the output just before it.
	for n := 1; n < len(input); n++ {
		out[n] = alpha*input[n] + (1-alpha)*out[n-1]
	}
	return out
}
|
|
|
|
// Downsample reduces input from originalSampleRate to targetSampleRate by
// averaging consecutive groups of samples.
//
// The group size is the integer quotient originalSampleRate/targetSampleRate,
// so the effective output rate equals targetSampleRate only when it divides
// originalSampleRate evenly. A final, shorter group is averaged over the
// samples it actually contains.
//
// An error is returned when either rate is not positive or when
// targetSampleRate exceeds originalSampleRate. For empty input the result is
// nil.
func Downsample(input []float64, originalSampleRate, targetSampleRate int) ([]float64, error) {
	if targetSampleRate <= 0 || originalSampleRate <= 0 {
		return nil, errors.New("sample rates must be positive")
	}
	if targetSampleRate > originalSampleRate {
		return nil, errors.New("target sample rate must be less than or equal to original sample rate")
	}

	// Both rates are positive and target <= original, so ratio is always >= 1.
	ratio := originalSampleRate / targetSampleRate

	if len(input) == 0 {
		return nil, nil
	}

	// The output length is known (ceil(len/ratio)), so allocate it once.
	// It is computed without len+ratio to stay safe for very large ratios.
	outLen := len(input) / ratio
	if len(input)%ratio != 0 {
		outLen++
	}
	resampled := make([]float64, 0, outLen)

	for i := 0; i < len(input); i += ratio {
		end := i + ratio
		if end > len(input) {
			end = len(input)
		}

		sum := 0.0
		for _, v := range input[i:end] {
			sum += v
		}
		resampled = append(resampled, sum/float64(end-i))
	}

	return resampled, nil
}
|
|
|
|
// Peak is one significant time-frequency point taken from a spectrogram.
type Peak struct {
	// Freq is the frequency of the peak, in Hz.
	Freq float64
	// Time is the time of the peak, in seconds.
	Time float64
}
|
|
|
|
// ExtractPeaks analyzes a spectrogram and extracts significant peaks in the frequency domain over time.
|
|
func ExtractPeaks(spectrogram [][]float64, audioDuration float64, sampleRate int) []Peak {
|
|
if len(spectrogram) < 1 {
|
|
return []Peak{}
|
|
}
|
|
|
|
type maxies struct {
|
|
maxMag float64
|
|
freqIdx int
|
|
}
|
|
|
|
bands := []struct{ min, max int }{
|
|
{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512},
|
|
}
|
|
|
|
var peaks []Peak
|
|
frameDuration := audioDuration / float64(len(spectrogram))
|
|
|
|
// Calculate frequency resolution (Hz per bin)
|
|
effectiveSampleRate := float64(sampleRate) / float64(dspRatio)
|
|
freqResolution := effectiveSampleRate / float64(windowSize)
|
|
|
|
for frameIdx, frame := range spectrogram {
|
|
var maxMags []float64
|
|
var freqIndices []int
|
|
|
|
binBandMaxies := []maxies{}
|
|
for _, band := range bands {
|
|
var maxx maxies
|
|
var maxMag float64
|
|
for idx, mag := range frame[band.min:band.max] {
|
|
if mag > maxMag {
|
|
maxMag = mag
|
|
freqIdx := band.min + idx
|
|
maxx = maxies{mag, freqIdx}
|
|
}
|
|
}
|
|
binBandMaxies = append(binBandMaxies, maxx)
|
|
}
|
|
|
|
for _, value := range binBandMaxies {
|
|
maxMags = append(maxMags, value.maxMag)
|
|
freqIndices = append(freqIndices, value.freqIdx)
|
|
}
|
|
|
|
// Calculate the average magnitude
|
|
var maxMagsSum float64
|
|
for _, max := range maxMags {
|
|
maxMagsSum += max
|
|
}
|
|
avg := maxMagsSum / float64(len(maxMags))
|
|
|
|
// Add peaks that exceed the average magnitude
|
|
for i, value := range maxMags {
|
|
if value > avg {
|
|
peakTime := float64(frameIdx) * frameDuration
|
|
peakFreq := float64(freqIndices[i]) * freqResolution
|
|
|
|
peaks = append(peaks, Peak{Time: peakTime, Freq: peakFreq})
|
|
}
|
|
}
|
|
}
|
|
|
|
return peaks
|
|
}
|