import numpy as np
from config import paths
import os
from moviepy.editor import VideoFileClip
import torchaudio
from speechbrain.inference.speaker import EncoderClassifier

class EmbeddingAudioService:
    """Extract the start of a video's audio track and embed the speaker's voice.

    On construction the first ``duration`` seconds of audio are written as a
    16-bit PCM WAV file to ``paths.AUDIO_OUTPUT_FOLDER/tmp_audio.wav``.
    ``create_embedding`` then feeds that file to the SpeechBrain ECAPA-TDNN
    speaker encoder.

    Attributes:
        audio_filepath: Path of the temporary WAV file (fixed file name, so
            each new instance overwrites the previous extraction).
        has_audio: ``True`` when the video had an audio track and the WAV file
            was written by this instance.
    """

    # Sample rate the ECAPA VoxCeleb model was trained on, according to its
    # model card; `encode_batch` does not resample on its own.
    MODEL_SAMPLE_RATE = 16000

    def __init__(self, video_path, duration: int = 60):
        """Write the first ``duration`` seconds of the video's audio to disk.

        Args:
            video_path: Path of the video file to read.
            duration: Number of seconds to keep. The default of one minute
                should be enough to determine language and gender.
        """
        self.audio_filepath = os.path.join(paths.AUDIO_OUTPUT_FOLDER, "tmp_audio.wav")
        self.has_audio = False

        video = VideoFileClip(video_path)
        try:
            track = video.audio
            if track is not None:
                # Clamp to the real track length: requesting a subclip longer
                # than the audio makes the writer read past the end of it.
                end = duration
                if track.duration is not None:
                    end = min(duration, track.duration)
                track.subclip(0, end).write_audiofile(
                    self.audio_filepath, codec="pcm_s16le"
                )
                self.has_audio = True
        finally:
            # Release the reader handles held by the clip.
            video.close()

    @staticmethod
    def _to_mono_batch(signal):
        """Return ``signal`` as a float32 tensor of shape ``[1, time]``.

        Args:
            signal: Either a 1-D sequence of samples or a channels-first
                ``[channels, time]`` array/tensor.

        Returns:
            A float32 ``torch.Tensor`` with a single row: the samples
            themselves for 1-D input, the per-sample channel average otherwise.
        """
        import torch  # local import: this module has no top-level torch import

        waveform = torch.as_tensor(signal, dtype=torch.float32)
        if waveform.dim() == 1:
            return waveform.unsqueeze(0)
        # Averaging the channels keeps one utterance per file; without it each
        # channel would be treated as a separate batch item by the encoder.
        return waveform.mean(dim=0, keepdim=True)

    def create_embedding(self) -> np.ndarray:
        """Compute the speaker embedding of the extracted audio.

        Returns:
            A 1-D NumPy array holding the embedding of the audio file. Earlier
            notes in this code mention 192 or 512 values depending on the
            model version.

        Raises:
            ValueError: If the video had no audio track, so no file was
                written by this instance.
        """
        import torch  # local import: this module has no top-level torch import

        if not self.has_audio:
            # Without this guard a stale tmp_audio.wav left by an earlier
            # instance could be embedded silently.
            raise ValueError("No audio track was extracted from the video")

        # Load model (ECAPA-TDNN trained on VoxCeleb)
        classifier = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb"
        )

        # Load audio file; fall back to soundfile when torchaudio cannot read it.
        try:
            signal, fs = torchaudio.load(self.audio_filepath)
        except Exception:
            import soundfile as sf

            print("Error in torchaudio.load ")
            samples, fs = sf.read(
                self.audio_filepath, dtype="float32", always_2d=True
            )
            # soundfile yields (frames, channels); torchaudio yields
            # (channels, frames). Normalise to the torchaudio layout.
            signal = torch.from_numpy(samples.T)

        waveform = self._to_mono_batch(signal)
        if fs != self.MODEL_SAMPLE_RATE:
            waveform = torchaudio.functional.resample(
                waveform, fs, self.MODEL_SAMPLE_RATE
            )

        embedding = classifier.encode_batch(waveform)
        return embedding.squeeze().detach().cpu().numpy().flatten()
