// seek-tune/shazam/shazam.go
// 2024-04-14 23:56:55 +01:00
//
// 458 lines
// 12 KiB
// Go

package shazam
import (
"crypto/sha256"
"encoding/binary"
"fmt"
"math"
"math/cmplx"
"math/rand"
"song-recognition/utils"
"sort"
"time"
"github.com/mjibson/go-dsp/fft"
"go.mongodb.org/mongo-driver/bson/primitive"
)
// Framing and hashing parameters.
const (
	// chunkSize is the number of input bytes copied into each FFT frame.
	chunkSize = 4096
	// hopSize is the offset, in input bytes, between the starts of
	// consecutive frames.
	hopSize = 128
	// fuzzFactor is the step to which values are rounded down before they
	// are combined into a hash.
	fuzzFactor = 2
)

// Audio format parameters.
// NOTE(review): bitDepth and channels are not referenced anywhere in the
// visible part of this file; units are presumed (Hz, bytes per sample,
// channel count) — confirm before relying on them.
const (
	samplingRate = 44100
	bitDepth     = 2
	channels     = 1
)
// ChunkTag is the metadata stored alongside a fingerprint: which song the
// fingerprinted chunk belongs to and where in the song it was taken from.
type ChunkTag struct {
	// Song identification, copied verbatim into every stored tag.
	SongTitle, SongArtist string
	YouTubeID             string
	// TimeStamp is the chunk position formatted as "15:04:05".
	TimeStamp string
}
// Match fingerprints sampleAudio, looks every fingerprint up in the database
// and returns one chunk tag per candidate song, ordered by
// sortMatchesByTimeDifference.
//
// For each fingerprint the stored tags are grouped by "<songtitle> by
// <songartist>"; the first tag seen for a song is the one that is returned
// for it. A song is kept as a candidate only when timeDifference yields at
// least two values for its collected timestamps. Songs with too few
// timestamps to analyse are skipped rather than failing the whole match.
//
// An error is returned when the database cannot be opened or queried, or
// when a stored timestamp cannot be parsed.
func Match(sampleAudio []byte) ([]primitive.M, error) {
	// Message used by getMaxPeak and timeDifference to report that a song
	// has fewer than two timestamps.
	const insufficientTimestamps = "insufficient timestamps"

	sampleChunks := Chunkify(sampleAudio)
	// With a nil tag only the fingerprint list is populated, so the map
	// result is intentionally discarded.
	chunkFingerprints, _ := FingerprintChunks(sampleChunks, nil)

	db, err := utils.NewDbClient()
	if err != nil {
		return nil, err
	}
	defer db.Close()

	chunkTags := make(map[string]primitive.M)
	songsTimestamps := make(map[string][]string)

	for _, chunkfgp := range chunkFingerprints {
		listOfChunkTags, err := db.GetChunkTags(chunkfgp)
		if err != nil {
			return nil, err
		}

		for _, chunkTag := range listOfChunkTags {
			timeStamp := fmt.Sprint(chunkTag["timestamp"])
			songKey := fmt.Sprintf("%s by %s", chunkTag["songtitle"], chunkTag["songartist"])

			// Remember the first tag seen for each song.
			if _, seen := chunkTags[songKey]; !seen {
				chunkTags[songKey] = chunkTag
			}
			songsTimestamps[songKey] = append(songsTimestamps[songKey], timeStamp)
		}
	}

	maxMatchCount := 0
	var maxMatch string
	matches := make(map[string][]int)

	for songKey, timestamps := range songsTimestamps {
		seconds, err := timestampsInSeconds(timestamps)
		if err != nil {
			return nil, err
		}

		maxPeak, differenceSum, err := getMaxPeak(seconds)
		if err != nil {
			// A song hit by a single fingerprint cannot be analysed; it is
			// simply not a candidate.
			if err.Error() == insufficientTimestamps {
				continue
			}
			return nil, err
		}
		fmt.Printf("%s MaxPeak: %v, DifferenceSum: %d\n", songKey, maxPeak, differenceSum)
		fmt.Println("=====================================================")
		fmt.Println()

		differences, err := timeDifference(timestamps)
		if err != nil {
			if err.Error() == insufficientTimestamps {
				continue
			}
			return nil, err
		}

		if len(differences) >= 2 {
			matches[songKey] = differences
			if len(differences) > maxMatchCount {
				maxMatchCount = len(differences)
				maxMatch = songKey
			}
		}
	}

	sortedChunkTags := sortMatchesByTimeDifference(matches, chunkTags)

	fmt.Println("MATCH: ", maxMatch)

	return sortedChunkTags, nil
}
// sortMatchesByTimeDifference returns the chunk tag of every song in matches,
// ordered by the sum of that song's differences, largest sum first. Songs
// with equal sums are ordered by song key so that the result does not depend
// on map iteration order.
//
// chunkTags is indexed by the same song keys as matches; a key missing from
// chunkTags contributes a nil entry. The result is nil when matches is empty.
func sortMatchesByTimeDifference(matches map[string][]int, chunkTags map[string]primitive.M) []primitive.M {
	if len(matches) == 0 {
		return nil
	}

	type songScore struct {
		songKey string
		sum     int
	}

	scores := make([]songScore, 0, len(matches))
	for songKey, differences := range matches {
		sum := 0
		for _, difference := range differences {
			sum += difference
		}
		scores = append(scores, songScore{songKey: songKey, sum: sum})
	}

	sort.Slice(scores, func(i, j int) bool {
		if scores[i].sum != scores[j].sum {
			return scores[i].sum > scores[j].sum
		}
		// Song keys are unique map keys, so this makes the order total.
		return scores[i].songKey < scores[j].songKey
	})

	sortedChunkTags := make([]primitive.M, 0, len(scores))
	for _, score := range scores {
		sortedChunkTags = append(sortedChunkTags, chunkTags[score.songKey])
	}
	return sortedChunkTags
}
// timestampsInSeconds converts "HH:MM:SS" timestamps into seconds elapsed
// since 00:00:00, preserving the input order.
//
// It returns a nil slice and a wrapped parse error for the first timestamp
// that does not match the "15:04:05" layout.
func timestampsInSeconds(timestamps []string) ([]int, error) {
	const clockLayout = "15:04:05"

	seconds := make([]int, 0, len(timestamps))
	for _, stamp := range timestamps {
		parsed, err := time.Parse(clockLayout, stamp)
		if err != nil {
			return nil, fmt.Errorf("error parsing timestamp %q: %w", stamp, err)
		}

		hour, minute, second := parsed.Clock()
		seconds = append(seconds, hour*3600+minute*60+second)
	}

	return seconds, nil
}
// getMaxPeak identifies clusters of timestamps (peaks) within a sequence where the differences between adjacent timestamps
// are below a certain threshold. It returns the largest peak, the sum of differences within that peak, and an error if any.
func getMaxPeak(timestamps []int) ([]int, int, error) {
if len(timestamps) < 2 {
return nil, 0, fmt.Errorf("insufficient timestamps")
}
var peaks [][]int
maxDifference := 15
var cluster []int
// Iterate over timestamps to identify peaks
for i := 0; i < len(timestamps)-1; i++ {
minuend, subtrahend := timestamps[i], timestamps[i+1]
// Ensure timestamps are in ascending order
if minuend > subtrahend {
peaks = append(peaks, cluster)
cluster = nil
continue
}
difference := int(math.Abs(float64(minuend - subtrahend)))
// Check if the difference is within the maximum allowed difference
if len(cluster) == 0 && difference <= maxDifference {
cluster = append(cluster, minuend, subtrahend)
} else if difference <= maxDifference {
cluster = append(cluster, subtrahend)
} else if difference > maxDifference {
peaks = append(peaks, cluster)
cluster = nil
}
}
// Identify the largest peak(s)
largestPeak := [][]int{peaks[0]}
for _, peak := range peaks[1:] {
if len(peak) == len(largestPeak[0]) {
largestPeak = append(largestPeak, peak)
} else if len(peak) > len(largestPeak[0]) {
largestPeak = nil
largestPeak = append(largestPeak, peak)
}
}
// In the case where there are multiple largest peaks,
// identify and return the largest peak with the smallest sum of differences
if len(largestPeak) > 1 {
fmt.Println("Largest Peak > 1: ", largestPeak)
// Deduplicate largest peaks in order to get accurate sum of difference
var largestPeakDeDuplicated [][]int
for _, peak := range largestPeak {
largestPeakDeDuplicated = append(largestPeakDeDuplicated, deduplicate(peak))
}
fmt.Println("Largest Peak deduplicated: ", largestPeakDeDuplicated)
minDifferenceSum := math.Inf(1)
var peakWithMinDifferenceSum []int
for idx, peak := range largestPeakDeDuplicated {
if len(peak) <= 1 {
continue
}
differenceSum := 0.0
for i := len(peak) - 1; i >= 1; i-- {
differenceSum += math.Abs(float64(peak[i] - peak[i-1]))
}
if differenceSum < minDifferenceSum {
minDifferenceSum = differenceSum
fmt.Printf("%v vs %v\n", largestPeak[idx], peak)
peakWithMinDifferenceSum = largestPeak[idx]
}
}
// In the case where no peak with the min difference sum was identified,
// probably because they were all duplicates, return the first from the largestspeaks
if len(peakWithMinDifferenceSum) == 0 {
peakWithMinDifferenceSum = largestPeak[0]
}
return peakWithMinDifferenceSum, int(minDifferenceSum), nil
}
// Otherwise, return the largest peak
maxPeak := largestPeak[0]
differenceSum := 0
for i := len(maxPeak) - 1; i >= 1; i-- {
differenceSum += maxPeak[i] - maxPeak[i-1]
}
return maxPeak, differenceSum, nil
}
// timeDifference parses "HH:MM:SS" timestamps and returns the gaps, in
// seconds, between adjacent timestamps that are strictly positive and at
// most 15 seconds. The input order is kept (no sorting); gaps are reported
// walking the input from its end towards its start.
//
// The result is nil when no adjacent pair qualifies. An "insufficient
// timestamps" error is returned for fewer than two timestamps, and a wrapped
// parse error for a timestamp that does not match the "15:04:05" layout.
func timeDifference(timestamps []string) ([]int, error) {
	if len(timestamps) < 2 {
		return nil, fmt.Errorf("insufficient timestamps")
	}

	const (
		layout = "15:04:05"
		maxGap = 15 // seconds
	)

	seconds := make([]int, len(timestamps))
	for i, ts := range timestamps {
		parsedTime, err := time.Parse(layout, ts)
		if err != nil {
			return nil, fmt.Errorf("error parsing timestamp %q: %w", ts, err)
		}
		hour, minute, second := parsedTime.Clock()
		seconds[i] = hour*3600 + minute*60 + second
	}

	var differences []int
	for i := len(seconds) - 1; i >= 1; i-- {
		if gap := seconds[i] - seconds[i-1]; gap > 0 && gap <= maxGap {
			differences = append(differences, gap)
		}
	}

	return differences, nil
}
// Chunkify cuts audio into frames of chunkSize bytes, weights each frame with
// a Hamming window and returns the FFT of every frame.
//
// Each input byte is used as one sample value. A frame that would run past
// the end of audio is zero-padded up to chunkSize.
//
// NOTE(review): the number of frames is computed from a stride of
// chunkSize-hopSize, while frames actually start every hopSize bytes. With
// the current constants the frames therefore cover only the leading part of
// audio — confirm which stride is intended before changing it, since stored
// fingerprints depend on the current behaviour.
func Chunkify(audio []byte) [][]complex128 {
	frameCount := len(audio) / (chunkSize - hopSize)

	// Precompute the Hamming window coefficients once.
	hamming := make([]float64, chunkSize)
	for n := 0; n < chunkSize; n++ {
		hamming[n] = 0.54 - 0.46*math.Cos(2*math.Pi*float64(n)/float64(chunkSize-1))
	}

	spectra := make([][]complex128, 0, frameCount)
	for frameIdx := 0; frameIdx < frameCount; frameIdx++ {
		from := frameIdx * hopSize
		to := from + chunkSize
		if to > len(audio) {
			to = len(audio)
		}

		// Samples beyond the end of audio stay zero.
		frame := make([]complex128, chunkSize)
		for offset, sample := range audio[from:to] {
			frame[offset] = complex(float64(sample)*hamming[offset], 0)
		}

		spectra = append(spectra, fft.FFT(frame))
	}

	return spectra
}
// FingerprintChunks derives one fingerprint per chunk of FFT coefficients.
//
// For every chunk the integer magnitude of each coefficient is computed and,
// for each of the value ranges [60,250), [250,500), [500,2000) and
// [2000,4000), the largest magnitude falling inside the range is kept (0 if
// none does). The four values are combined with hash into the fingerprint.
//
// When chunkTag is nil the fingerprints are returned as a list in chunk order
// and the map is empty. Otherwise the list is nil and the map associates each
// fingerprint with a copy of *chunkTag whose TimeStamp is the chunk's
// position formatted as "15:04:05"; the position advances by one second
// every chunksPerSecond chunks. If two chunks produce the same fingerprint,
// the later chunk's tag replaces the earlier one.
//
// NOTE(review): the ranges are compared against coefficient magnitudes, not
// against the frequency a coefficient represents — confirm this is intended.
func FingerprintChunks(chunks [][]complex128, chunkTag *ChunkTag) ([]int64, map[int64]ChunkTag) {
	// NOTE(review): hard-coded; it is not derived from the sampling rate or
	// the framing constants — confirm the intended chunk rate.
	const chunksPerSecond = 3

	// Only these ranges feed the hash; they do not overlap, so a magnitude
	// matches at most one of them.
	bands := [4]struct{ min, max int }{{60, 250}, {250, 500}, {500, 2000}, {2000, 4000}}

	var fingerprintList []int64
	fingerprintMap := make(map[int64]ChunkTag)

	chunkCount := 0
	chunkTime := time.Date(1, 1, 1, 0, 0, 0, 0, time.UTC)

	for _, chunk := range chunks {
		if chunkTag != nil {
			chunkCount++
			if chunkCount == chunksPerSecond {
				chunkCount = 0
				chunkTime = chunkTime.Add(time.Second)
			}
		}

		// peaks[b] is the largest magnitude seen inside bands[b].
		var peaks [4]int
		for _, coefficient := range chunk {
			magnitude := int(cmplx.Abs(coefficient))
			for b, band := range bands {
				if magnitude >= band.min && magnitude < band.max {
					if magnitude > peaks[b] {
						peaks[b] = magnitude
					}
					break
				}
			}
		}

		points := [4]int64{int64(peaks[0]), int64(peaks[1]), int64(peaks[2]), int64(peaks[3])}
		key := hash(points[:])

		if chunkTag != nil {
			newSampleTag := *chunkTag
			newSampleTag.TimeStamp = chunkTime.Format("15:04:05")
			fingerprintMap[key] = newSampleTag
		} else {
			fingerprintList = append(fingerprintList, key)
		}
	}

	return fingerprintList, fingerprintMap
}
// hash folds values into a single int64 key. Each value is first rounded
// towards zero to a multiple of fuzzFactor and then multiplied by a weight;
// the weight starts at 100 and is squared after every value (100, 10^4,
// 10^8, 10^16, ...). The weighted values are summed.
//
// NOTE(review): the arithmetic wraps silently on overflow; with four values
// that already happens once the fourth value reaches roughly 923. The result
// is still deterministic, and it is what FingerprintChunks uses as its map
// key, so the formula must not change without regenerating fingerprints.
func hash(values []int64) int64 {
	var sum int64
	scale := 100
	for idx := 0; idx < len(values); idx++ {
		quantized := values[idx] / fuzzFactor * fuzzFactor
		sum += quantized * int64(scale)
		scale *= scale
	}
	return sum
}
// hash1 combines the first four entries of values into one int64. Every
// entry is rounded towards zero to a multiple of fuzzFactor; the entries are
// then weighted 1, 100, 100000 and 100000000 (first to fourth) and summed.
//
// values must hold at least four entries; further entries are ignored.
func hash1(values []int64) int64 {
	first, second, third, fourth := values[0], values[1], values[2], values[3]

	// quantize drops the remainder of v with respect to fuzzFactor.
	quantize := func(v int64) int64 { return v - v%fuzzFactor }

	key := quantize(fourth) * 100000000
	key += quantize(third) * 100000
	key += quantize(second) * 100
	key += quantize(first)
	return key
}
// hash2 returns the first eight bytes (big-endian) of the SHA-256 digest of
// values, after shifting every value by a random offset in
// [-fuzzFactor/2, fuzzFactor-fuzzFactor/2). Each shifted value is serialised
// as eight little-endian bytes.
//
// The caller's slice is left untouched; the offsets are applied to copies.
//
// NOTE(review): because the offsets are random, equal inputs generally do
// not produce equal results — confirm that is intended before using the
// result as a lookup key.
func hash2(values []int64) int64 {
	buf := make([]byte, 8*len(values))
	for i, v := range values {
		fuzzed := v + rand.Int63n(fuzzFactor) - fuzzFactor/2
		binary.LittleEndian.PutUint64(buf[i*8:], uint64(fuzzed))
	}

	digest := sha256.Sum256(buf)
	return int64(binary.BigEndian.Uint64(digest[:8]))
}