package wav import ( "bytes" "context" "encoding/base64" "encoding/binary" "encoding/json" "errors" "fmt" "io/ioutil" "log/slog" "os" "os/exec" "song-recognition/models" "song-recognition/utils" "strings" "time" "github.com/mdobak/go-xerrors" ) // WavHeader defines the structure of a WAV header type WavHeader struct { ChunkID [4]byte ChunkSize uint32 Format [4]byte Subchunk1ID [4]byte Subchunk1Size uint32 AudioFormat uint16 NumChannels uint16 SampleRate uint32 BytesPerSec uint32 BlockAlign uint16 BitsPerSample uint16 Subchunk2ID [4]byte Subchunk2Size uint32 } func writeWavHeader(f *os.File, data []byte, sampleRate int, channels int, bitsPerSample int) error { // Validate input if len(data)%channels != 0 { return errors.New("data size not divisible by channels") } // Calculate derived values subchunk1Size := uint32(16) // Assuming PCM format bytesPerSample := bitsPerSample / 8 blockAlign := uint16(channels * bytesPerSample) subchunk2Size := uint32(len(data)) // Build WAV header header := WavHeader{ ChunkID: [4]byte{'R', 'I', 'F', 'F'}, ChunkSize: uint32(36 + len(data)), Format: [4]byte{'W', 'A', 'V', 'E'}, Subchunk1ID: [4]byte{'f', 'm', 't', ' '}, Subchunk1Size: subchunk1Size, AudioFormat: uint16(1), // PCM format NumChannels: uint16(channels), SampleRate: uint32(sampleRate), BytesPerSec: uint32(sampleRate * channels * bytesPerSample), BlockAlign: blockAlign, BitsPerSample: uint16(bitsPerSample), Subchunk2ID: [4]byte{'d', 'a', 't', 'a'}, Subchunk2Size: subchunk2Size, } // Write header to file err := binary.Write(f, binary.LittleEndian, header) return err } func WriteWavFile(filename string, data []byte, sampleRate int, channels int, bitsPerSample int) error { f, err := os.Create(filename) if err != nil { return err } defer f.Close() if sampleRate <= 0 || channels <= 0 || bitsPerSample <= 0 { return fmt.Errorf( "values must be greater than zero (sampleRate: %d, channels: %d, bitsPerSample: %d)", sampleRate, channels, bitsPerSample, ) } err = writeWavHeader(f, data, sampleRate, channels, bitsPerSample) if err != nil { return err } _, err = f.Write(data) return err } type WavInfo struct { Channels int SampleRate int Duration float64 Data []byte LeftChannelSamples []float64 RightChannelSamples []float64 } // ReadWavInfo reads a 16-bit PCM WAV file and returns its metadata and audio samples. // Supports mono and stereo files. Note that it only supports 16-bit PCM format. func ReadWavInfo(filename string) (*WavInfo, error) { data, err := ioutil.ReadFile(filename) if err != nil { return nil, err } if len(data) < 44 { return nil, errors.New("invalid WAV file size (too small)") } // Parse PCM header to extract metadata // https://en.wikipedia.org/wiki/WAV#WAV_file_header var header WavHeader if err := binary.Read(bytes.NewReader(data[:44]), binary.LittleEndian, &header); err != nil { return nil, err } if string(header.ChunkID[:]) != "RIFF" || string(header.Format[:]) != "WAVE" || header.AudioFormat != 1 { return nil, errors.New("invalid WAV header format") } info := &WavInfo{ Channels: int(header.NumChannels), SampleRate: int(header.SampleRate), Data: data[44:], } if header.BitsPerSample != 16 { return nil, errors.New("unsupported bits‑per‑sample (expect 16‑bit PCM)") } sampleCount := len(info.Data) / 2 int16Buf := make([]int16, sampleCount) if err := binary.Read(bytes.NewReader(info.Data), binary.LittleEndian, int16Buf); err != nil { return nil, err } const scale = 1.0 / 32768.0 // 16‑bit normalisation factor switch header.NumChannels { case 1: left := make([]float64, sampleCount) for i, s := range int16Buf { left[i] = float64(s) * scale } info.LeftChannelSamples = left case 2: frameCount := sampleCount / 2 left := make([]float64, frameCount) right := make([]float64, frameCount) for i := 0; i < frameCount; i++ { left[i] = float64(int16Buf[2*i]) * scale right[i] = float64(int16Buf[2*i+1]) * scale } info.LeftChannelSamples = left info.RightChannelSamples = right default: return nil, errors.New("unsupported channel count (only mono/stereo)") } // Compute audio duration in seconds info.Duration = float64(sampleCount) / (float64(header.NumChannels) * float64(header.SampleRate)) return info, nil } // WavBytesToFloat64 converts a slice of bytes from a .wav file to a slice of float64 samples func WavBytesToSamples(input []byte) ([]float64, error) { if len(input)%2 != 0 { return nil, errors.New("invalid input length") } numSamples := len(input) / 2 output := make([]float64, numSamples) for i := 0; i < len(input); i += 2 { // Interpret bytes as a 16-bit signed integer (little-endian) sample := int16(binary.LittleEndian.Uint16(input[i : i+2])) // Scale the sample to the range [-1, 1] output[i/2] = float64(sample) / 32768.0 } return output, nil } // FFmpegMetadata represents the metadata structure returned by ffprobe. type FFmpegMetadata struct { Streams []struct { Index int `json:"index"` CodecName string `json:"codec_name"` CodecLongName string `json:"codec_long_name"` CodecType string `json:"codec_type"` SampleFmt string `json:"sample_fmt"` SampleRate string `json:"sample_rate"` Channels int `json:"channels"` ChannelLayout string `json:"channel_layout"` BitsPerSample int `json:"bits_per_sample"` Duration string `json:"duration"` BitRate string `json:"bit_rate"` Disposition map[string]int `json:"disposition"` Tags map[string]string `json:"tags"` } `json:"streams"` Format struct { Streams int `json:"nb_streams"` FormFilename string `json:"filename"` NbatName string `json:"format_name"` FormatLongName string `json:"format_long_name"` StartTime string `json:"start_time"` Duration string `json:"duration"` Size string `json:"size"` BitRate string `json:"bit_rate"` Tags map[string]string `json:"tags"` } `json:"format"` } // GetMetadata retrieves metadata from a file using ffprobe. func GetMetadata(filePath string) (FFmpegMetadata, error) { var metadata FFmpegMetadata cmd := exec.Command("ffprobe", "-v", "quiet", "-print_format", "json", "-show_format", "-show_streams", filePath) var out bytes.Buffer cmd.Stdout = &out err := cmd.Run() if err != nil { return metadata, err } err = json.Unmarshal(out.Bytes(), &metadata) if err != nil { return metadata, err } // convert all keys of the Tags map to lowercase for k, v := range metadata.Format.Tags { metadata.Format.Tags[strings.ToLower(k)] = v } for k, v := range metadata.Streams[0].Tags { metadata.Streams[0].Tags[strings.ToLower(k)] = v } return metadata, nil } func ProcessRecording(recData *models.RecordData, saveRecording bool) ([]float64, error) { decodedAudioData, err := base64.StdEncoding.DecodeString(recData.Audio) if err != nil { return nil, err } now := time.Now() fileName := fmt.Sprintf("%04d_%02d_%02d_%02d_%02d_%02d.wav", now.Second(), now.Minute(), now.Hour(), now.Day(), now.Month(), now.Year(), ) filePath := "tmp/" + fileName err = WriteWavFile(filePath, decodedAudioData, recData.SampleRate, recData.Channels, recData.SampleSize) if err != nil { return nil, err } reformatedWavFile, err := ReformatWAV(filePath, 1) if err != nil { return nil, err } wavInfo, _ := ReadWavInfo(reformatedWavFile) samples, _ := WavBytesToSamples(wavInfo.Data) if saveRecording { logger := utils.GetLogger() ctx := context.Background() err := utils.CreateFolder("recordings") if err != nil { err := xerrors.New(err) logger.ErrorContext(ctx, "Failed create folder.", slog.Any("error", err)) } newFilePath := strings.Replace(reformatedWavFile, "tmp/", "recordings/", 1) err = os.Rename(reformatedWavFile, newFilePath) if err != nil { logger.ErrorContext(ctx, "Failed to move file.", slog.Any("error", err)) } } utils.DeleteFile(fileName) utils.DeleteFile(reformatedWavFile) return samples, nil }