mirror of
https://github.com/cgzirim/seek-tune.git
synced 2025-12-16 16:34:21 +00:00
fix(spectrogram): correct STFT and peak extraction algorithm
- Fix frame calculation with proper sliding window iteration - Change hop size to windowSize/2 for 50% overlap - Return magnitude spectrum instead of complex values - Fix Peak time/frequency calculations using proper frame-based indexing - Add Hz conversion using frequency resolution - Remove incorrect frequency-based time calculations
This commit is contained in:
parent
6f1111eb35
commit
e3a35ef1eb
1 changed files with 56 additions and 43 deletions
|
|
@ -8,13 +8,14 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
dspRatio = 4
|
||||
freqBinSize = 1024
|
||||
maxFreq = 5000.0 // 5kHz
|
||||
hopSize = freqBinSize / 32
|
||||
dspRatio = 4
|
||||
windowSize = 1024
|
||||
maxFreq = 5000.0 // 5kHz
|
||||
hopSize = windowSize / 2 // 50% overlap for better time-frequency resolution
|
||||
windowType = "hanning" // choices: "hanning" or "hamming"
|
||||
)
|
||||
|
||||
func Spectrogram(sample []float64, sampleRate int) ([][]complex128, error) {
|
||||
func Spectrogram(sample []float64, sampleRate int) ([][]float64, error) {
|
||||
filteredSample := LowPassFilter(maxFreq, float64(sampleRate), sample)
|
||||
|
||||
downsampledSample, err := Downsample(filteredSample, sampleRate, sampleRate/dspRatio)
|
||||
|
|
@ -22,31 +23,42 @@ func Spectrogram(sample []float64, sampleRate int) ([][]complex128, error) {
|
|||
return nil, fmt.Errorf("couldn't downsample audio sample: %v", err)
|
||||
}
|
||||
|
||||
numOfWindows := len(downsampledSample) / (freqBinSize - hopSize)
|
||||
spectrogram := make([][]complex128, numOfWindows)
|
||||
|
||||
window := make([]float64, freqBinSize)
|
||||
window := make([]float64, windowSize)
|
||||
for i := range window {
|
||||
window[i] = 0.54 - 0.46*math.Cos(2*math.Pi*float64(i)/(float64(freqBinSize)-1))
|
||||
theta := 2 * math.Pi * float64(i) / float64(windowSize-1)
|
||||
switch windowType {
|
||||
case "hamming":
|
||||
window[i] = 0.54 - 0.46*math.Cos(theta)
|
||||
default: // Hanning window
|
||||
window[i] = 0.5 - 0.5*math.Cos(theta)
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize spectrogram slice
|
||||
spectrogram := make([][]float64, 0)
|
||||
|
||||
// Perform STFT
|
||||
for i := 0; i < numOfWindows; i++ {
|
||||
start := i * hopSize
|
||||
end := start + freqBinSize
|
||||
if end > len(downsampledSample) {
|
||||
end = len(downsampledSample)
|
||||
}
|
||||
for start := 0; start+windowSize <= len(downsampledSample); start += hopSize {
|
||||
end := start + windowSize
|
||||
|
||||
bin := make([]float64, freqBinSize)
|
||||
copy(bin, downsampledSample[start:end])
|
||||
frame := make([]float64, windowSize)
|
||||
copy(frame, downsampledSample[start:end])
|
||||
|
||||
// Apply Hamming window
|
||||
// Apply window
|
||||
for j := range window {
|
||||
bin[j] *= window[j]
|
||||
frame[j] *= window[j]
|
||||
}
|
||||
|
||||
spectrogram[i] = FFT(bin)
|
||||
// Perform FFT
|
||||
fftResult := FFT(frame)
|
||||
|
||||
// Convert complex spectrum to magnitude spectrum
|
||||
magnitude := make([]float64, len(fftResult)/2)
|
||||
for j := range magnitude {
|
||||
magnitude[j] = cmplx.Abs(fftResult[j])
|
||||
}
|
||||
|
||||
spectrogram = append(spectrogram, magnitude)
|
||||
}
|
||||
|
||||
return spectrogram, nil
|
||||
|
|
@ -107,43 +119,47 @@ func Downsample(input []float64, originalSampleRate, targetSampleRate int) ([]fl
|
|||
return resampled, nil
|
||||
}
|
||||
|
||||
// Peak represents a significant point in the spectrogram.
|
||||
type Peak struct {
|
||||
Time float64
|
||||
Freq complex128
|
||||
Freq float64 // Frequency in Hz
|
||||
Time float64 // Time in seconds
|
||||
}
|
||||
|
||||
// ExtractPeaks analyzes a spectrogram and extracts significant peaks in the frequency domain over time.
|
||||
func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak {
|
||||
func ExtractPeaks(spectrogram [][]float64, audioDuration float64, sampleRate int) []Peak {
|
||||
if len(spectrogram) < 1 {
|
||||
return []Peak{}
|
||||
}
|
||||
|
||||
type maxies struct {
|
||||
maxMag float64
|
||||
maxFreq complex128
|
||||
freqIdx int
|
||||
}
|
||||
|
||||
bands := []struct{ min, max int }{{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512}}
|
||||
bands := []struct{ min, max int }{
|
||||
{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512},
|
||||
}
|
||||
|
||||
var peaks []Peak
|
||||
binDuration := audioDuration / float64(len(spectrogram))
|
||||
frameDuration := audioDuration / float64(len(spectrogram))
|
||||
|
||||
for binIdx, bin := range spectrogram {
|
||||
// Calculate frequency resolution (Hz per bin)
|
||||
effectiveSampleRate := float64(sampleRate) / float64(dspRatio)
|
||||
freqResolution := effectiveSampleRate / float64(windowSize)
|
||||
|
||||
for frameIdx, frame := range spectrogram {
|
||||
var maxMags []float64
|
||||
var maxFreqs []complex128
|
||||
var freqIndices []float64
|
||||
var freqIndices []int
|
||||
|
||||
binBandMaxies := []maxies{}
|
||||
for _, band := range bands {
|
||||
var maxx maxies
|
||||
var maxMag float64
|
||||
for idx, freq := range bin[band.min:band.max] {
|
||||
magnitude := cmplx.Abs(freq)
|
||||
if magnitude > maxMag {
|
||||
maxMag = magnitude
|
||||
for idx, mag := range frame[band.min:band.max] {
|
||||
if mag > maxMag {
|
||||
maxMag = mag
|
||||
freqIdx := band.min + idx
|
||||
maxx = maxies{magnitude, freq, freqIdx}
|
||||
maxx = maxies{mag, freqIdx}
|
||||
}
|
||||
}
|
||||
binBandMaxies = append(binBandMaxies, maxx)
|
||||
|
|
@ -151,8 +167,7 @@ func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak {
|
|||
|
||||
for _, value := range binBandMaxies {
|
||||
maxMags = append(maxMags, value.maxMag)
|
||||
maxFreqs = append(maxFreqs, value.maxFreq)
|
||||
freqIndices = append(freqIndices, float64(value.freqIdx))
|
||||
freqIndices = append(freqIndices, value.freqIdx)
|
||||
}
|
||||
|
||||
// Calculate the average magnitude
|
||||
|
|
@ -160,17 +175,15 @@ func ExtractPeaks(spectrogram [][]complex128, audioDuration float64) []Peak {
|
|||
for _, max := range maxMags {
|
||||
maxMagsSum += max
|
||||
}
|
||||
avg := maxMagsSum / float64(len(maxFreqs)) // * coefficient
|
||||
avg := maxMagsSum / float64(len(maxMags))
|
||||
|
||||
// Add peaks that exceed the average magnitude
|
||||
for i, value := range maxMags {
|
||||
if value > avg {
|
||||
peakTimeInBin := freqIndices[i] * binDuration / float64(len(bin))
|
||||
peakTime := float64(frameIdx) * frameDuration
|
||||
peakFreq := float64(freqIndices[i]) * freqResolution
|
||||
|
||||
// Calculate the absolute time of the peak
|
||||
peakTime := float64(binIdx)*binDuration + peakTimeInBin
|
||||
|
||||
peaks = append(peaks, Peak{Time: peakTime, Freq: maxFreqs[i]})
|
||||
peaks = append(peaks, Peak{Time: peakTime, Freq: peakFreq})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue