import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import spaces
import numpy as np
from pydub import AudioSegment
import tempfile
import os

# Model configuration - Using CrisperWhisper for TRUE verbatim transcription
# CrisperWhisper is designed to transcribe EVERY word including um, uh, fillers, stutters, false starts
MODEL_NAME = "nyrahealth/CrisperWhisper"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Loading {MODEL_NAME} for verbatim transcription...")

# Load model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)

# Create pipeline optimized for verbatim output
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=8,  # Reduced batch size for stability
    return_timestamps="word",  # CrisperWhisper provides accurate word-level timestamps
    torch_dtype=torch_dtype,
    device=device,
)

print("Model loaded successfully!")


def get_audio_duration(audio_path):
    """Get duration of audio file in seconds."""
    try:
        audio = AudioSegment.from_file(audio_path)
        return len(audio) / 1000.0
    except:
        return None


def slice_audio(audio_path, chunk_duration=300):
    """
    Slice audio into chunks of specified duration (in seconds).
    Default is 5 minutes (300 seconds) per chunk.
    """
    audio = AudioSegment.from_file(audio_path)
    duration_ms = len(audio)
    chunk_duration_ms = chunk_duration * 1000

    chunks = []
    for i in range(0, duration_ms, chunk_duration_ms):
        chunk = audio[i:i + chunk_duration_ms]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            chunk.export(temp_file.name, format="wav")
            chunks.append(temp_file.name)
    return chunks


@spaces.GPU
def transcribe_audio_chunk(audio_input, task="transcribe", language=None):
    """
    Transcribe a single audio chunk with CrisperWhisper.
    This model is specifically trained for verbatim transcription.
    """
    try:
        generate_kwargs = {
            "task": task,
        }
        if language:
            generate_kwargs["language"] = language

        # CrisperWhisper automatically provides verbatim transcription
        result = pipe(audio_input, generate_kwargs=generate_kwargs)
        return result
    except Exception as e:
        # Fallback: try without generate_kwargs if there's a tensor mismatch
        print(f"Error with generate_kwargs: {e}")
        try:
            result = pipe(audio_input)
            return result
        except Exception as e2:
            raise Exception(f"Transcription failed: {str(e2)}")


def transcribe_audio(audio, task="transcribe", return_timestamps=False, language=None, progress=gr.Progress()):
    """
    Transcribe audio with VERY VERBATIM output using CrisperWhisper.

    CrisperWhisper transcribes every spoken word exactly as it is, including:
    - Fillers (um, uh, ah, er, mm)
    - Pauses and hesitations
    - Stutters and repetitions
    - False starts
    - Non-standard utterances
    """
    if audio is None:
        return "Please provide an audio file or recording."

    temp_files = []
    try:
        # Handle different audio input formats
        if isinstance(audio, str):
            audio_path = audio
        elif isinstance(audio, tuple):
            sr, audio_data = audio
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                import scipy.io.wavfile
                scipy.io.wavfile.write(temp_file.name, sr, audio_data)
                audio_path = temp_file.name
            temp_files.append(audio_path)
        else:
            return "Unsupported audio format."

        # Check audio duration and slice if necessary
        duration = get_audio_duration(audio_path)
        chunk_duration = 300  # 5 minutes per chunk

        if duration and duration > chunk_duration:
            progress(0, desc=f"Audio is {duration:.1f}s long. Slicing into chunks...")
            audio_chunks = slice_audio(audio_path, chunk_duration)
            temp_files.extend(audio_chunks)
        else:
            audio_chunks = [audio_path]

        # Process each chunk
        all_transcriptions = []
        total_chunks = len(audio_chunks)

        for idx, chunk_path in enumerate(audio_chunks):
            progress((idx + 1) / total_chunks, desc=f"Transcribing chunk {idx + 1}/{total_chunks}...")

            result = transcribe_audio_chunk(chunk_path, task, language)

            if return_timestamps and "chunks" in result:
                chunk_offset = idx * chunk_duration
                chunk_text = result["text"]
                timestamp_text = []
                for word_chunk in result["chunks"]:
                    start = word_chunk["timestamp"][0]
                    end = word_chunk["timestamp"][1]
                    if start is not None and end is not None:
                        timestamp_text.append({
                            "start": start + chunk_offset,
                            "end": end + chunk_offset,
                            "text": word_chunk["text"]
                        })
                all_transcriptions.append({
                    "text": chunk_text,
                    "timestamps": timestamp_text
                })
            else:
                all_transcriptions.append({
                    "text": result["text"],
                    "timestamps": []
                })

        # Combine all transcriptions
        full_text = " ".join([t["text"] for t in all_transcriptions])

        output = f"**Verbatim Transcription:**\n{full_text}\n"

        if return_timestamps:
            output += "\n**Word-level Timestamps:**\n"
            for trans in all_transcriptions:
                for ts in trans["timestamps"]:
                    output += f"[{ts['start']:.2f}s - {ts['end']:.2f}s] {ts['text']}\n"

        if duration:
            output += f"\n*Total duration: {duration:.1f}s | Processed in {total_chunks} chunk(s)*"

        return output

    except Exception as e:
        return f"Error during transcription: {str(e)}"
    finally:
        # Clean up temporary files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)
            except:
                pass


# Language options for manual selection
LANGUAGES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Dutch": "nl",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Czech": "cs",
    "Romanian": "ro",
    "Swedish": "sv",
    "Danish": "da",
    "Norwegian": "no",
    "Finnish": "fi",
    "Greek": "el",
    "Hebrew": "he",
}

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ Very Verbatim Multilingual Speech-to-Text

        Powered by **CrisperWhisper** - specifically designed for verbatim transcription with ZeroGPU acceleration.

        ## 🔥 TRUE Verbatim Transcription

        Unlike standard Whisper (which omits disfluencies), **CrisperWhisper captures EVERYTHING**:

        - ✅ **Fillers**: um, uh, ah, er, mm, like, you know
        - ✅ **Hesitations**: pauses, breath sounds, stutters
        - ✅ **False Starts**: "I was- I went to the store"
        - ✅ **Repetitions**: "I I I think that..."
        - ✅ **Disfluencies**: Every non-fluent speech element
        - ✅ **Accurate Word-Level Timestamps**: Precise timing even around disfluencies
        - ✅ **Multilingual**: Supports 99+ languages
        - ✅ **Long Audio Support**: Automatic 5-minute chunking

        **Perfect for:** Legal transcription, linguistic research, therapy sessions, interviews, conversational AI training, or any use case requiring exact speech capture.
        """
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input"
            )

            with gr.Row():
                task_radio = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task",
                    info="Transcribe verbatim or translate to English"
                )
                language_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Auto-detect",
                    label="Language",
                    info="Select language or use auto-detect"
                )

            timestamps_checkbox = gr.Checkbox(
                label="Show word-level timestamps",
                value=True,
                info="Display precise timing for each word"
            )

            transcribe_btn = gr.Button("🎯 Transcribe Verbatim", variant="primary", size="lg")

        with gr.Column():
            output_text = gr.Textbox(
                label="Verbatim Transcription (includes all um, uh, hesitations)",
                lines=20,
                show_copy_button=True,
                placeholder="Your VERY verbatim transcription will appear here...\n\nEvery um, uh, stutter, and hesitation will be captured!"
            )

    gr.Markdown(
        """
        ### Why CrisperWhisper for Verbatim?

        **Standard Whisper** is trained to transcribe the "intended meaning" - it automatically cleans up:
        - ❌ Removes "um", "uh", "ah"
        - ❌ Omits false starts
        - ❌ Skips repetitions
        - ❌ Ignores stutters

        **CrisperWhisper** is specifically trained for verbatim transcription:
        - ✅ Keeps every filler word
        - ✅ Preserves all disfluencies
        - ✅ Captures exact speech patterns
        - ✅ Accurate timestamps around hesitations

        ### Example Comparison

        **Input Audio:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        **Standard Whisper:** "So I was thinking that we could go to the store"

        **CrisperWhisper:** "Um, so, uh, I was- I was thinking that, like, we could- we could go to the, uh, the store"

        ### Use Cases

        - **Legal/Court Transcription**: Exact wording required by law
        - **Linguistic Research**: Study of natural speech patterns and disfluencies
        - **Medical/Therapy Sessions**: Capturing patient speech patterns
        - **Interview Transcription**: Preserving speaker mannerisms
        - **Conversational AI Training**: Realistic dialogue data
        - **Accessibility**: Providing complete transcripts for deaf/hard-of-hearing
        - **Language Learning**: Analyzing natural spoken language

        ### Tips for Best Results

        - Clear audio with minimal background noise works best
        - The model captures quiet speech - ensure consistent audio levels
        - Manual language selection can improve accuracy
        - Long files are automatically processed in 5-minute chunks
        - Timestamps help identify exact moments of hesitations
        """
    )

    # Set up event handler
    def transcribe_wrapper(audio, task, timestamps, language_name, progress=gr.Progress()):
        language_code = LANGUAGES[language_name]
        return transcribe_audio(audio, task, timestamps, language_code, progress)

    transcribe_btn.click(
        fn=transcribe_wrapper,
        inputs=[audio_input, task_radio, timestamps_checkbox, language_dropdown],
        outputs=output_text
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()