video-compliance-ai/backend/app/services/asr.py

"""
ASR 语音转写服务
集成 Whisper API 实现音频转写
"""
import asyncio
import os
import tempfile
from dataclasses import dataclass, field
from typing import Optional

import httpx


@dataclass
class TranscriptSegment:
    """转写片段"""
    text: str
    start: float  # 开始时间（秒）
    end: float  # 结束时间（秒）
    confidence: float = 1.0


@dataclass
class TranscriptionResult:
    """转写结果"""
    success: bool
    text: str = ""  # 完整文本
    segments: list[TranscriptSegment] = field(default_factory=list)
    language: str = "zh"
    duration: float = 0.0
    error: Optional[str] = None


class ASRService:
    """ASR 语音转写服务"""

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.openai.com/v1",
        model: str = "whisper-1",
        timeout: float = 300.0,
    ):
        """
        初始化 ASR 服务

        Args:
            api_key: API Key
            base_url: API 基础 URL
            model: 模型名称
            timeout: 请求超时（秒）
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.timeout = timeout

    async def transcribe_file(
        self,
        audio_path: str,
        language: str = "zh",
        response_format: str = "verbose_json",
    ) -> TranscriptionResult:
        """
        转写音频文件

        Args:
            audio_path: 音频文件路径
            language: 语言代码
            response_format: 响应格式

        Returns:
            TranscriptionResult: 转写结果
        """
        if not os.path.exists(audio_path):
            return TranscriptionResult(
                success=False,
                error=f"文件不存在: {audio_path}",
            )

        try:
            async with httpx.AsyncClient(
                timeout=httpx.Timeout(self.timeout)
            ) as client:
                with open(audio_path, "rb") as f:
                    files = {"file": (os.path.basename(audio_path), f, "audio/mpeg")}
                    data = {
                        "model": self.model,
                        "language": language,
                        "response_format": response_format,
                    }

                    response = await client.post(
                        f"{self.base_url}/audio/transcriptions",
                        headers={"Authorization": f"Bearer {self.api_key}"},
                        files=files,
                        data=data,
                    )

                    if response.status_code != 200:
                        return TranscriptionResult(
                            success=False,
                            error=f"API 错误 {response.status_code}: {response.text[:200]}",
                        )

                    result = response.json()
                    return self._parse_response(result, language)

        except Exception as e:
            return TranscriptionResult(
                success=False,
                error=str(e),
            )

    async def transcribe_url(
        self,
        audio_url: str,
        language: str = "zh",
    ) -> TranscriptionResult:
        """
        转写远程音频

        Args:
            audio_url: 音频 URL
            language: 语言代码

        Returns:
            TranscriptionResult: 转写结果
        """
        # 下载音频到临时文件
        temp_path = None
        try:
            async with httpx.AsyncClient(
                timeout=httpx.Timeout(60.0),
                follow_redirects=True,
            ) as client:
                response = await client.get(audio_url)
                if response.status_code != 200:
                    return TranscriptionResult(
                        success=False,
                        error=f"下载音频失败: HTTP {response.status_code}",
                    )

                # 写入临时文件
                with tempfile.NamedTemporaryFile(
                    suffix=".mp3",
                    delete=False,
                ) as f:
                    f.write(response.content)
                    temp_path = f.name

            # 转写
            result = await self.transcribe_file(temp_path, language)
            return result

        except Exception as e:
            return TranscriptionResult(
                success=False,
                error=str(e),
            )
        finally:
            # 清理临时文件
            if temp_path and os.path.exists(temp_path):
                try:
                    os.remove(temp_path)
                except OSError:
                    pass

    def _parse_response(
        self,
        response: dict,
        language: str,
    ) -> TranscriptionResult:
        """解析 API 响应"""
        text = response.get("text", "")
        duration = response.get("duration", 0.0)

        segments = []
        for seg in response.get("segments", []):
            segments.append(TranscriptSegment(
                text=seg.get("text", "").strip(),
                start=seg.get("start", 0.0),
                end=seg.get("end", 0.0),
                confidence=seg.get("confidence", 1.0) if "confidence" in seg else 1.0,
            ))

        # 如果没有分段信息，创建单个分段
        if not segments and text:
            segments = [TranscriptSegment(
                text=text,
                start=0.0,
                end=duration,
            )]

        return TranscriptionResult(
            success=True,
            text=text,
            segments=segments,
            language=language,
            duration=duration,
        )


class AudioExtractor:
    """从视频中提取音频"""

    def __init__(self, ffmpeg_path: str = "ffmpeg"):
        self.ffmpeg_path = ffmpeg_path

    async def extract_audio(
        self,
        video_path: str,
        output_path: Optional[str] = None,
        format: str = "mp3",
        sample_rate: int = 16000,
    ) -> Optional[str]:
        """
        从视频中提取音频

        Args:
            video_path: 视频文件路径
            output_path: 输出路径，默认生成临时文件
            format: 输出格式
            sample_rate: 采样率

        Returns:
            音频文件路径，失败返回 None
        """
        import shutil

        if not shutil.which(self.ffmpeg_path):
            return None

        if output_path is None:
            output_path = tempfile.mktemp(suffix=f".{format}")

        cmd = [
            self.ffmpeg_path,
            "-i", video_path,
            "-vn",  # 不要视频
            "-acodec", "libmp3lame" if format == "mp3" else "pcm_s16le",
            "-ar", str(sample_rate),
            "-ac", "1",  # 单声道
            "-y",
            output_path,
        ]

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            _, stderr = await process.communicate()

            if process.returncode != 0:
                return None

            return output_path

        except Exception:
            return None


class VideoASRService:
    """视频 ASR 服务（组合音频提取和转写）"""

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.openai.com/v1",
        model: str = "whisper-1",
    ):
        self.asr = ASRService(api_key, base_url, model)
        self.audio_extractor = AudioExtractor()

    async def transcribe_video(
        self,
        video_path: str,
        language: str = "zh",
    ) -> TranscriptionResult:
        """
        转写视频中的语音

        Args:
            video_path: 视频文件路径
            language: 语言代码

        Returns:
            TranscriptionResult: 转写结果
        """
        # 提取音频
        audio_path = await self.audio_extractor.extract_audio(video_path)
        if not audio_path:
            return TranscriptionResult(
                success=False,
                error="音频提取失败，请确保 FFmpeg 已安装",
            )

        try:
            # 转写
            result = await self.asr.transcribe_file(audio_path, language)
            return result
        finally:
            # 清理临时音频
            if os.path.exists(audio_path):
                try:
                    os.remove(audio_path)
                except OSError:
                    pass