fix(spectrogram): correct STFT and peak extraction algorithm

- Fix frame calculation with proper sliding window iteration
- Change hop size to windowSize/2 for 50% overlap
- Return magnitude spectrum instead of complex values
- Fix Peak time/frequency calculations using proper frame-based indexing
- Add Hz conversion using frequency resolution
- Remove incorrect frequency-based time calculations
This commit is contained in:
Chigozirim Igweamaka 2025-11-19 16:47:01 +01:00
parent 6f1111eb35
commit e3a35ef1eb

View file

@ -8,13 +8,14 @@ import (
) )
const ( const (
dspRatio = 4 dspRatio = 4
freqBinSize = 1024 windowSize = 1024
maxFreq = 5000.0 // 5kHz maxFreq = 5000.0 // 5kHz
hopSize = freqBinSize / 32 hopSize = windowSize / 2 // 50% overlap for better time-frequency resolution
windowType = "hanning" // choices: "hanning" or "hamming"
) )
func Spectrogram(sample []float64, sampleRate int) ([][]complex128, error) { func Spectrogram(sample []float64, sampleRate int) ([][]float64, error) {
filteredSample := LowPassFilter(maxFreq, float64(sampleRate), sample) filteredSample := LowPassFilter(maxFreq, float64(sampleRate), sample)
downsampledSample, err := Downsample(filteredSample, sampleRate, sampleRate/dspRatio) downsampledSample, err := Downsample(filteredSample, sampleRate, sampleRate/dspRatio)
@ -22,31 +23,42 @@ func Spectrogram(sample []float64, sampleRate int) ([][]complex128, error) {
return nil, fmt.Errorf("couldn't downsample audio sample: %v", err) return nil, fmt.Errorf("couldn't downsample audio sample: %v", err)
} }
numOfWindows := len(downsampledSample) / (freqBinSize - hopSize) window := make([]float64, windowSize)
spectrogram := make([][]complex128, numOfWindows)
window := make([]float64, freqBinSize)
for i := range window { for i := range window {
window[i] = 0.54 - 0.46*math.Cos(2*math.Pi*float64(i)/(float64(freqBinSize)-1)) theta := 2 * math.Pi * float64(i) / float64(windowSize-1)
switch windowType {
case "hamming":
window[i] = 0.54 - 0.46*math.Cos(theta)
default: // Hanning window
window[i] = 0.5 - 0.5*math.Cos(theta)
}
} }
// Initialize spectrogram slice
spectrogram := make([][]float64, 0)
// Perform STFT // Perform STFT
for i := 0; i < numOfWindows; i++ { for start := 0; start+windowSize <= len(downsampledSample); start += hopSize {
start := i * hopSize end := start + windowSize
end := start + freqBinSize
if end > len(downsampledSample) {
end = len(downsampledSample)
}
bin := make([]float64, freqBinSize) frame := make([]float64, windowSize)
copy(bin, downsampledSample[start:end]) copy(frame, downsampledSample[start:end])
// Apply Hamming window // Apply window
for j := range window { for j := range window {
bin[j] *= window[j] frame[j] *= window[j]
} }
spectrogram[i] = FFT(bin) // Perform FFT
fftResult := FFT(frame)
// Convert complex spectrum to magnitude spectrum
magnitude := make([]float64, len(fftResult)/2)
for j := range magnitude {
magnitude[j] = cmplx.Abs(fftResult[j])
}
spectrogram = append(spectrogram, magnitude)
} }
return spectrogram, nil return spectrogram, nil
@ -107,43 +119,47 @@ func Downsample(input []float64, originalSampleRate, targetSampleRate int) ([]fl
return resampled, nil return resampled, nil
} }
// Peak represents a significant point in the spectrogram.
type Peak struct { type Peak struct {
Time float64 Freq float64 // Frequency in Hz
Freq complex128 Time float64 // Time in seconds
} }
// ExtractPeaks analyzes a spectrogram and extracts significant peaks in the frequency domain over time. // ExtractPeaks analyzes a spectrogram and extracts significant peaks in the frequency domain over time.
func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak { func ExtractPeaks(spectrogram [][]float64, audioDuration float64, sampleRate int) []Peak {
if len(spectrogram) < 1 { if len(spectrogram) < 1 {
return []Peak{} return []Peak{}
} }
type maxies struct { type maxies struct {
maxMag float64 maxMag float64
maxFreq complex128
freqIdx int freqIdx int
} }
bands := []struct{ min, max int }{{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512}} bands := []struct{ min, max int }{
{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512},
}
var peaks []Peak var peaks []Peak
binDuration := audioDuration / float64(len(spectrogram)) frameDuration := audioDuration / float64(len(spectrogram))
for binIdx, bin := range spectrogram { // Calculate frequency resolution (Hz per bin)
effectiveSampleRate := float64(sampleRate) / float64(dspRatio)
freqResolution := effectiveSampleRate / float64(windowSize)
for frameIdx, frame := range spectrogram {
var maxMags []float64 var maxMags []float64
var maxFreqs []complex128 var freqIndices []int
var freqIndices []float64
binBandMaxies := []maxies{} binBandMaxies := []maxies{}
for _, band := range bands { for _, band := range bands {
var maxx maxies var maxx maxies
var maxMag float64 var maxMag float64
for idx, freq := range bin[band.min:band.max] { for idx, mag := range frame[band.min:band.max] {
magnitude := cmplx.Abs(freq) if mag > maxMag {
if magnitude > maxMag { maxMag = mag
maxMag = magnitude
freqIdx := band.min + idx freqIdx := band.min + idx
maxx = maxies{magnitude, freq, freqIdx} maxx = maxies{mag, freqIdx}
} }
} }
binBandMaxies = append(binBandMaxies, maxx) binBandMaxies = append(binBandMaxies, maxx)
@ -151,8 +167,7 @@ func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak {
for _, value := range binBandMaxies { for _, value := range binBandMaxies {
maxMags = append(maxMags, value.maxMag) maxMags = append(maxMags, value.maxMag)
maxFreqs = append(maxFreqs, value.maxFreq) freqIndices = append(freqIndices, value.freqIdx)
freqIndices = append(freqIndices, float64(value.freqIdx))
} }
// Calculate the average magnitude // Calculate the average magnitude
@ -160,17 +175,15 @@ func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak {
for _, max := range maxMags { for _, max := range maxMags {
maxMagsSum += max maxMagsSum += max
} }
avg := maxMagsSum / float64(len(maxFreqs)) // * coefficient avg := maxMagsSum / float64(len(maxMags))
// Add peaks that exceed the average magnitude // Add peaks that exceed the average magnitude
for i, value := range maxMags { for i, value := range maxMags {
if value > avg { if value > avg {
peakTimeInBin := freqIndices[i] * binDuration / float64(len(bin)) peakTime := float64(frameIdx) * frameDuration
peakFreq := float64(freqIndices[i]) * freqResolution
// Calculate the absolute time of the peak peaks = append(peaks, Peak{Time: peakTime, Freq: peakFreq})
peakTime := float64(binIdx)*binDuration + peakTimeInBin
peaks = append(peaks, Peak{Time: peakTime, Freq: maxFreqs[i]})
} }
} }
} }