seek-tune/server/shazam/spectrogram.go

package shazam

import (
	"errors"
	"fmt"
	"math"
	"math/cmplx"
)

// Tuning parameters shared by Spectrogram and ExtractPeaks.
const (
	// dspRatio is the integer factor by which Spectrogram divides the input
	// sample rate before running the STFT; ExtractPeaks uses the same factor
	// to convert bin indices back to Hz.
	dspRatio   = 4
	// windowSize is the number of samples in each STFT frame.
	windowSize = 1024
	// maxFreq is the cutoff frequency passed to LowPassFilter, in Hz.
	maxFreq    = 5000.0         // 5kHz
	// hopSize is the number of samples by which consecutive frames are offset.
	hopSize    = windowSize / 2 // 50% overlap for better time-frequency resolution
	// windowType selects the window function applied to each frame. Any value
	// other than "hamming" falls through to the Hanning window.
	windowType = "hanning"      // choices: "hanning" or "hamming"
)

// Spectrogram computes a magnitude spectrogram of sample with a short-time
// Fourier transform.
//
// Processing steps:
//  1. sample is passed through LowPassFilter with a cutoff of maxFreq.
//  2. The filtered signal is reduced to sampleRate/dspRatio with Downsample.
//  3. Frames of windowSize samples are taken every hopSize samples, multiplied
//     by the window selected by windowType, and handed to FFT.
//  4. The magnitudes of the first half of each FFT result form one row of
//     the returned spectrogram.
//
// If the downsampled signal is shorter than one window, the result is an
// empty, non-nil slice. A trailing partial frame is dropped.
//
// An error is returned when Downsample rejects the sample rates (for example
// when sampleRate is smaller than dspRatio, which makes the target rate zero).
// The underlying error is wrapped and can be inspected with errors.Is/As.
func Spectrogram(sample []float64, sampleRate int) ([][]float64, error) {
	filteredSample := LowPassFilter(maxFreq, float64(sampleRate), sample)

	downsampledSample, err := Downsample(filteredSample, sampleRate, sampleRate/dspRatio)
	if err != nil {
		return nil, fmt.Errorf("couldn't downsample audio sample: %w", err)
	}

	// Precompute the window coefficients once; they are identical for every frame.
	window := make([]float64, windowSize)
	for i := range window {
		theta := 2 * math.Pi * float64(i) / float64(windowSize-1)
		switch windowType {
		case "hamming":
			window[i] = 0.54 - 0.46*math.Cos(theta)
		default: // Hanning window
			window[i] = 0.5 - 0.5*math.Cos(theta)
		}
	}

	// The number of full frames is known up front, so size the result once
	// instead of letting append grow it repeatedly.
	frameCount := 0
	if n := len(downsampledSample); n >= windowSize {
		frameCount = (n-windowSize)/hopSize + 1
	}
	spectrogram := make([][]float64, 0, frameCount)

	// Perform STFT
	for start := 0; start+windowSize <= len(downsampledSample); start += hopSize {
		segment := downsampledSample[start : start+windowSize]

		// A fresh buffer per frame: FFT's handling of its argument is not
		// visible here, so the buffer is never reused across calls.
		frame := make([]float64, windowSize)
		for j, w := range window {
			frame[j] = segment[j] * w
		}

		fftResult := FFT(frame)

		// Keep only the first half of the spectrum and convert it to magnitudes.
		magnitude := make([]float64, len(fftResult)/2)
		for j := range magnitude {
			magnitude[j] = cmplx.Abs(fftResult[j])
		}

		spectrogram = append(spectrogram, magnitude)
	}

	return spectrogram, nil
}

// LowPassFilter applies a first-order (single-pole RC) low-pass filter to
// input and returns the result in a newly allocated slice of the same length;
// input itself is not modified.
//
// The filter is the discrete form of H(s) = 1 / (1 + sRC) with
// RC = 1 / (2π·cutoffFrequency) and a sample period of 1/sampleRate:
//
//	y[0] = α·x[0]
//	y[n] = α·x[n] + (1-α)·y[n-1],  α = dt / (RC + dt)
//
// Neither cutoffFrequency nor sampleRate is validated.
func LowPassFilter(cutoffFrequency, sampleRate float64, input []float64) []float64 {
	out := make([]float64, len(input))
	if len(input) == 0 {
		return out
	}

	samplePeriod := 1.0 / sampleRate
	timeConstant := 1.0 / (2 * math.Pi * cutoffFrequency)
	alpha := samplePeriod / (timeConstant + samplePeriod)

	// The filter state starts at zero, so the first output is just the
	// scaled first input.
	out[0] = input[0] * alpha
	for n := 1; n < len(input); n++ {
		out[n] = alpha*input[n] + (1-alpha)*out[n-1]
	}
	return out
}

// Downsample reduces the sample rate of input by averaging consecutive groups
// of samples.
//
// The group size is the integer quotient originalSampleRate/targetSampleRate.
// Because the quotient is truncated, a target rate that does not divide the
// original rate evenly produces an effective rate of originalSampleRate/ratio
// rather than exactly targetSampleRate. A trailing group with fewer than
// ratio samples is averaged over the samples it actually contains.
//
// It returns an error if either rate is not positive or if targetSampleRate
// exceeds originalSampleRate. An empty input yields a nil slice and no error.
func Downsample(input []float64, originalSampleRate, targetSampleRate int) ([]float64, error) {
	if targetSampleRate <= 0 || originalSampleRate <= 0 {
		return nil, errors.New("sample rates must be positive")
	}
	if targetSampleRate > originalSampleRate {
		return nil, errors.New("target sample rate must be less than or equal to original sample rate")
	}
	if len(input) == 0 {
		return nil, nil
	}

	// The checks above guarantee 0 < target <= original, so ratio >= 1.
	ratio := originalSampleRate / targetSampleRate

	// Number of output samples: one per full group plus one for a partial
	// tail. Computed without len+ratio to stay overflow-free for huge ratios.
	groups := len(input) / ratio
	if len(input)%ratio != 0 {
		groups++
	}

	resampled := make([]float64, groups)
	for k := range resampled {
		start := k * ratio
		end := start + ratio
		if end > len(input) {
			end = len(input)
		}

		sum := 0.0
		for _, v := range input[start:end] {
			sum += v
		}
		resampled[k] = sum / float64(end-start)
	}

	return resampled, nil
}

// Peak represents a significant point in the spectrogram, located by its
// frequency and its time offset. ExtractPeaks produces these values.
type Peak struct {
	Freq float64 // Frequency in Hz (bin index multiplied by the bin width)
	Time float64 // Time in seconds (frame index multiplied by the per-frame duration)
}

// ExtractPeaks analyzes a spectrogram and extracts significant peaks in the
// frequency domain over time.
//
// Each frame is split into fixed bin ranges (bands). The strongest bin of each
// band is a candidate, and a candidate becomes a Peak when its magnitude is
// strictly greater than the mean of all candidates in that frame. Peaks are
// returned in frame order, and within a frame in band order.
//
// Parameters:
//   - spectrogram: one row of magnitudes per frame. Frequencies are derived
//     from bin indices using windowSize and dspRatio, so the rows are assumed
//     to come from Spectrogram called with the same sampleRate.
//   - audioDuration: total duration covered by the spectrogram; it is divided
//     evenly across the frames to timestamp each peak (presumably seconds,
//     matching Peak.Time — confirm against callers).
//   - sampleRate: sample rate of the original audio, before downsampling.
//
// An empty spectrogram yields an empty, non-nil slice. If no candidate
// exceeds its frame average, the result is nil. Frames shorter than the
// band layout are handled by clamping bands to the frame length and skipping
// bands that lie entirely beyond it.
func ExtractPeaks(spectrogram [][]float64, audioDuration float64, sampleRate int) []Peak {
	if len(spectrogram) < 1 {
		return []Peak{}
	}

	// candidate is the strongest bin found in one band of one frame.
	type candidate struct {
		mag     float64
		freqIdx int
	}

	// Half-open bin ranges [min, max) that widen toward higher frequencies.
	bands := []struct{ min, max int }{
		{0, 10}, {10, 20}, {20, 40}, {40, 80}, {80, 160}, {160, 512},
	}

	var peaks []Peak
	frameDuration := audioDuration / float64(len(spectrogram))

	// Calculate frequency resolution (Hz per bin)
	effectiveSampleRate := float64(sampleRate) / float64(dspRatio)
	freqResolution := effectiveSampleRate / float64(windowSize)

	// Scratch buffer reused across frames.
	candidates := make([]candidate, 0, len(bands))

	for frameIdx, frame := range spectrogram {
		candidates = candidates[:0]
		var magSum float64

		for _, band := range bands {
			// Bands are ascending, so once one starts past the end of the
			// frame every later band does too.
			if band.min >= len(frame) {
				break
			}
			upper := band.max
			if upper > len(frame) {
				upper = len(frame)
			}

			// A band whose magnitudes are all <= 0 keeps the zero candidate,
			// which can never exceed the frame average.
			var best candidate
			for idx, mag := range frame[band.min:upper] {
				if mag > best.mag {
					best = candidate{mag: mag, freqIdx: band.min + idx}
				}
			}
			candidates = append(candidates, best)
			magSum += best.mag
		}

		if len(candidates) == 0 {
			continue // empty frame: nothing to average
		}
		avg := magSum / float64(len(candidates))

		// Add peaks that exceed the average magnitude
		peakTime := float64(frameIdx) * frameDuration
		for _, c := range candidates {
			if c.mag > avg {
				peakFreq := float64(c.freqIdx) * freqResolution
				peaks = append(peaks, Peak{Time: peakTime, Freq: peakFreq})
			}
		}
	}

	return peaks
}