// Mirror of https://github.com/cgzirim/seek-tune.git
// (synced 2025-12-18 09:24:19 +00:00; 302 lines, 8.3 KiB, Go).

package wav
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"encoding/base64"
|
||
"encoding/binary"
|
||
"encoding/json"
|
||
"errors"
|
||
"fmt"
|
||
"io/ioutil"
|
||
"log/slog"
|
||
"os"
|
||
"os/exec"
|
||
"song-recognition/models"
|
||
"song-recognition/utils"
|
||
"strings"
|
||
"time"
|
||
|
||
"github.com/mdobak/go-xerrors"
|
||
)
|
||
|
||
// WavHeader mirrors the canonical 44-byte RIFF/WAVE header: the RIFF
// descriptor, a 16-byte "fmt " sub-chunk and the header of the "data"
// sub-chunk.
//
// Every field has a fixed size and the declaration order is the on-disk
// order, so the struct can be handed directly to encoding/binary with
// little-endian byte order. Do not reorder or resize fields.
type WavHeader struct {
	// RIFF chunk descriptor.
	ChunkID   [4]byte // container tag, "RIFF"
	ChunkSize uint32  // bytes following this field (36 + audio data length)
	Format    [4]byte // form type, "WAVE"

	// "fmt " sub-chunk: describes the sample format.
	Subchunk1ID   [4]byte // sub-chunk tag, "fmt "
	Subchunk1Size uint32  // size of the rest of this sub-chunk (16 for PCM)
	AudioFormat   uint16  // encoding tag; 1 means PCM
	NumChannels   uint16  // number of interleaved channels
	SampleRate    uint32  // sample frames per second
	BytesPerSec   uint32  // SampleRate * NumChannels * bytes per sample
	BlockAlign    uint16  // NumChannels * bytes per sample
	BitsPerSample uint16  // bit depth of one sample

	// "data" sub-chunk header; the audio bytes follow it.
	Subchunk2ID   [4]byte // sub-chunk tag, "data"
	Subchunk2Size uint32  // number of audio data bytes
}
|
||
|
||
func writeWavHeader(f *os.File, data []byte, sampleRate int, channels int, bitsPerSample int) error {
|
||
// Validate input
|
||
if len(data)%channels != 0 {
|
||
return errors.New("data size not divisible by channels")
|
||
}
|
||
|
||
// Calculate derived values
|
||
subchunk1Size := uint32(16) // Assuming PCM format
|
||
bytesPerSample := bitsPerSample / 8
|
||
blockAlign := uint16(channels * bytesPerSample)
|
||
subchunk2Size := uint32(len(data))
|
||
|
||
// Build WAV header
|
||
header := WavHeader{
|
||
ChunkID: [4]byte{'R', 'I', 'F', 'F'},
|
||
ChunkSize: uint32(36 + len(data)),
|
||
Format: [4]byte{'W', 'A', 'V', 'E'},
|
||
Subchunk1ID: [4]byte{'f', 'm', 't', ' '},
|
||
Subchunk1Size: subchunk1Size,
|
||
AudioFormat: uint16(1), // PCM format
|
||
NumChannels: uint16(channels),
|
||
SampleRate: uint32(sampleRate),
|
||
BytesPerSec: uint32(sampleRate * channels * bytesPerSample),
|
||
BlockAlign: blockAlign,
|
||
BitsPerSample: uint16(bitsPerSample),
|
||
Subchunk2ID: [4]byte{'d', 'a', 't', 'a'},
|
||
Subchunk2Size: subchunk2Size,
|
||
}
|
||
|
||
// Write header to file
|
||
err := binary.Write(f, binary.LittleEndian, header)
|
||
return err
|
||
}
|
||
|
||
func WriteWavFile(filename string, data []byte, sampleRate int, channels int, bitsPerSample int) error {
|
||
f, err := os.Create(filename)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
defer f.Close()
|
||
|
||
if sampleRate <= 0 || channels <= 0 || bitsPerSample <= 0 {
|
||
return fmt.Errorf(
|
||
"values must be greater than zero (sampleRate: %d, channels: %d, bitsPerSample: %d)",
|
||
sampleRate, channels, bitsPerSample,
|
||
)
|
||
}
|
||
|
||
err = writeWavHeader(f, data, sampleRate, channels, bitsPerSample)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
_, err = f.Write(data)
|
||
return err
|
||
}
|
||
|
||
// WavInfo is the decoded form of a WAV file as produced by ReadWavInfo:
// the format metadata, the raw audio bytes and the per-channel samples.
type WavInfo struct {
	// Channels is the channel count taken from the file header.
	Channels int
	// SampleRate is the sample rate taken from the file header.
	SampleRate int
	// Duration is the length of the audio in seconds.
	Duration float64
	// Data holds the raw audio bytes of the file, header excluded.
	Data []byte
	// LeftChannelSamples holds the normalised samples of the first channel
	// (the only channel for mono input).
	LeftChannelSamples []float64
	// RightChannelSamples holds the normalised samples of the second channel;
	// it is only filled for stereo input.
	RightChannelSamples []float64
}
|
||
|
||
// ReadWavInfo reads a 16-bit PCM WAV file and returns its metadata and audio samples.
|
||
// Supports mono and stereo files. Note that it only supports 16-bit PCM format.
|
||
func ReadWavInfo(filename string) (*WavInfo, error) {
|
||
data, err := ioutil.ReadFile(filename)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if len(data) < 44 {
|
||
return nil, errors.New("invalid WAV file size (too small)")
|
||
}
|
||
|
||
// Parse PCM header to extract metadata
|
||
// https://en.wikipedia.org/wiki/WAV#WAV_file_header
|
||
var header WavHeader
|
||
if err := binary.Read(bytes.NewReader(data[:44]), binary.LittleEndian, &header); err != nil {
|
||
return nil, err
|
||
}
|
||
if string(header.ChunkID[:]) != "RIFF" ||
|
||
string(header.Format[:]) != "WAVE" ||
|
||
header.AudioFormat != 1 {
|
||
return nil, errors.New("invalid WAV header format")
|
||
}
|
||
|
||
info := &WavInfo{
|
||
Channels: int(header.NumChannels),
|
||
SampleRate: int(header.SampleRate),
|
||
Data: data[44:],
|
||
}
|
||
|
||
if header.BitsPerSample != 16 {
|
||
return nil, errors.New("unsupported bits‑per‑sample (expect 16‑bit PCM)")
|
||
}
|
||
|
||
sampleCount := len(info.Data) / 2
|
||
int16Buf := make([]int16, sampleCount)
|
||
if err := binary.Read(bytes.NewReader(info.Data), binary.LittleEndian, int16Buf); err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
const scale = 1.0 / 32768.0 // 16‑bit normalisation factor
|
||
|
||
switch header.NumChannels {
|
||
case 1:
|
||
left := make([]float64, sampleCount)
|
||
for i, s := range int16Buf {
|
||
left[i] = float64(s) * scale
|
||
}
|
||
info.LeftChannelSamples = left
|
||
|
||
case 2:
|
||
frameCount := sampleCount / 2
|
||
left := make([]float64, frameCount)
|
||
right := make([]float64, frameCount)
|
||
for i := 0; i < frameCount; i++ {
|
||
left[i] = float64(int16Buf[2*i]) * scale
|
||
right[i] = float64(int16Buf[2*i+1]) * scale
|
||
}
|
||
info.LeftChannelSamples = left
|
||
info.RightChannelSamples = right
|
||
|
||
default:
|
||
return nil, errors.New("unsupported channel count (only mono/stereo)")
|
||
}
|
||
|
||
// Compute audio duration in seconds
|
||
info.Duration = float64(sampleCount) /
|
||
(float64(header.NumChannels) * float64(header.SampleRate))
|
||
|
||
return info, nil
|
||
}
|
||
|
||
// WavBytesToSamples decodes raw 16-bit signed little-endian PCM bytes into
// float64 samples. Every sample is divided by 32768, which maps the int16
// range onto [-1, 1).
//
// The input must contain whole samples; an odd number of bytes is rejected
// with an error.
func WavBytesToSamples(input []byte) ([]float64, error) {
	if len(input)&1 == 1 {
		return nil, errors.New("invalid input length")
	}

	samples := make([]float64, len(input)/2)
	for idx := range samples {
		// Two consecutive bytes form one little-endian int16 sample.
		raw := binary.LittleEndian.Uint16(input[2*idx:])
		samples[idx] = float64(int16(raw)) / 32768.0
	}

	return samples, nil
}
|
||
|
||
// FFmpegMetadata is the decoding target for the JSON document printed by
// ffprobe when run with -show_format and -show_streams. The struct tags name
// the JSON keys each field is read from.
type FFmpegMetadata struct {
	// Streams has one entry per element of the "streams" array.
	Streams []struct {
		Index         int    `json:"index"`
		CodecName     string `json:"codec_name"`
		CodecLongName string `json:"codec_long_name"`
		CodecType     string `json:"codec_type"`
		SampleFmt     string `json:"sample_fmt"`
		// SampleRate, Duration and BitRate are decoded as JSON strings,
		// not numbers.
		SampleRate    string `json:"sample_rate"`
		Channels      int    `json:"channels"`
		ChannelLayout string `json:"channel_layout"`
		BitsPerSample int    `json:"bits_per_sample"`
		Duration      string `json:"duration"`
		BitRate       string `json:"bit_rate"`
		// Disposition maps disposition names to integer flags.
		Disposition map[string]int `json:"disposition"`
		// Tags holds the stream-level key/value tags.
		Tags map[string]string `json:"tags"`
	} `json:"streams"`

	// Format is decoded from the "format" object.
	Format struct {
		// Streams is the stream count ("nb_streams").
		Streams int `json:"nb_streams"`
		// FormFilename is read from "filename".
		// NOTE(review): the Go name looks like a typo for "Filename"; it is
		// exported, so it is kept for compatibility.
		FormFilename string `json:"filename"`
		// NbatName is read from "format_name".
		// NOTE(review): the Go name looks like a typo for "FormatName"; it
		// is exported, so it is kept for compatibility.
		NbatName       string `json:"format_name"`
		FormatLongName string `json:"format_long_name"`
		// The remaining numeric-looking values are decoded as JSON strings.
		StartTime string `json:"start_time"`
		Duration  string `json:"duration"`
		Size      string `json:"size"`
		BitRate   string `json:"bit_rate"`
		// Tags holds the container-level key/value tags.
		Tags map[string]string `json:"tags"`
	} `json:"format"`
}
|
||
|
||
// GetMetadata retrieves metadata from a file using ffprobe.
|
||
func GetMetadata(filePath string) (FFmpegMetadata, error) {
|
||
var metadata FFmpegMetadata
|
||
|
||
cmd := exec.Command("ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", "-show_streams", filePath)
|
||
var out bytes.Buffer
|
||
cmd.Stdout = &out
|
||
err := cmd.Run()
|
||
if err != nil {
|
||
return metadata, err
|
||
}
|
||
|
||
err = json.Unmarshal(out.Bytes(), &metadata)
|
||
if err != nil {
|
||
return metadata, err
|
||
}
|
||
|
||
// convert all keys of the Tags map to lowercase
|
||
for k, v := range metadata.Format.Tags {
|
||
metadata.Format.Tags[strings.ToLower(k)] = v
|
||
}
|
||
for k, v := range metadata.Streams[0].Tags {
|
||
metadata.Streams[0].Tags[strings.ToLower(k)] = v
|
||
}
|
||
|
||
return metadata, nil
|
||
}
|
||
|
||
func ProcessRecording(recData *models.RecordData, saveRecording bool) ([]float64, error) {
|
||
decodedAudioData, err := base64.StdEncoding.DecodeString(recData.Audio)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
now := time.Now()
|
||
fileName := fmt.Sprintf("%04d_%02d_%02d_%02d_%02d_%02d.wav",
|
||
now.Second(), now.Minute(), now.Hour(),
|
||
now.Day(), now.Month(), now.Year(),
|
||
)
|
||
filePath := "tmp/" + fileName
|
||
|
||
err = WriteWavFile(filePath, decodedAudioData, recData.SampleRate, recData.Channels, recData.SampleSize)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
reformatedWavFile, err := ReformatWAV(filePath, 1)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
wavInfo, _ := ReadWavInfo(reformatedWavFile)
|
||
samples, _ := WavBytesToSamples(wavInfo.Data)
|
||
|
||
if saveRecording {
|
||
logger := utils.GetLogger()
|
||
ctx := context.Background()
|
||
|
||
err := utils.CreateFolder("recordings")
|
||
if err != nil {
|
||
err := xerrors.New(err)
|
||
logger.ErrorContext(ctx, "Failed create folder.", slog.Any("error", err))
|
||
}
|
||
|
||
newFilePath := strings.Replace(reformatedWavFile, "tmp/", "recordings/", 1)
|
||
err = os.Rename(reformatedWavFile, newFilePath)
|
||
if err != nil {
|
||
logger.ErrorContext(ctx, "Failed to move file.", slog.Any("error", err))
|
||
}
|
||
}
|
||
|
||
utils.DeleteFile(fileName)
|
||
utils.DeleteFile(reformatedWavFile)
|
||
|
||
return samples, nil
|
||
}
|