12.26

2025-12-26 11:29:31 +08:00
parent 14bfdcbf51
commit 43ffe4486a
12 changed files with 196 additions and 40 deletions
--- a/examples/3d/babylonAdapter.js
+++ b/examples/3d/babylonAdapter.js
@ -12,7 +12,6 @@ class BabylonMorphTargetAdapter {
            const mtm = mesh.morphTargetManager;
            if (!mtm) return;

-            console.log(`网格 ${mesh.name}: ${mtm.numTargets} 个形态键`);

            for (let i = 0; i < mtm.numTargets; i++) {
                const mt = mtm.getTarget(i);
@ -25,14 +24,9 @@ class BabylonMorphTargetAdapter {
                }
                this.morphTargetCache[lowerName].push(mt);
                totalTargets++;
-
-                if (i < 3) {
-                    console.log(`  ${mt.name} -> ${lowerName}`);
-                }
            }
        });

-        console.log(`总计: ${totalTargets} 个形态键映射`);
        return totalTargets;
    }

--- a/examples/3d/blendshapeAnimator.js
+++ b/examples/3d/blendshapeAnimator.js
@ -6,6 +6,7 @@ class BlendShapeAnimator {
        this.animationShapeNames = [];
        this.isPlaying = false;
        this.currentFrameIndex = 0;
+        this.currentSentenceIndex = -1;
        this.animationStartTime = 0;
        this.idleAnimations = {};
        this.blendShapeScale = config.blendShapeScale || 1.0;
@ -14,6 +15,7 @@ class BlendShapeAnimator {
        this.streamingComplete = true;
        this.streamingWaitStart = null;
        this.streamingStallMs = 0;
+        this.sentenceTexts = [];  // 句子文本列表

        // 空闲动画参数
        this.blinkParams = config.blinkParams || {
@ -254,6 +256,14 @@ class BlendShapeAnimator {
        }

        this.currentFrameIndex = targetFrameIndex;
+
+        // 更新当前句子显示
+        const sentenceIndex = currentFrame?.sentenceIndex ?? -1;
+        if (sentenceIndex !== this.currentSentenceIndex) {
+            this.currentSentenceIndex = sentenceIndex;
+            this._updateCurrentSentenceDisplay();
+        }
+
        requestAnimationFrame(() => this._animateFrame());
    }

@ -514,6 +524,21 @@ class BlendShapeAnimator {
        return start + (end - start) * t;
    }

+    /**
+     * Show or hide the on-screen caption for the sentence currently playing.
+     *
+     * Reads `this.currentSentenceIndex` and `this.sentenceTexts` and writes to
+     * the `#currentSentence` / `#sentenceText` elements. Does nothing when
+     * either element is missing from the page.
+     */
+    _updateCurrentSentenceDisplay() {
+        const sentenceDiv = document.getElementById('currentSentence');
+        const sentenceText = document.getElementById('sentenceText');
+
+        if (!sentenceDiv || !sentenceText) return;
+
+        // sentenceTexts is assigned straight from the stream's status message,
+        // so treat anything that is not an array as "no captions" rather than
+        // throwing here, which would abort the frame callback before the next
+        // animation frame is scheduled.
+        const texts = Array.isArray(this.sentenceTexts) ? this.sentenceTexts : [];
+        const index = this.currentSentenceIndex;
+
+        if (index >= 0 && index < texts.length) {
+            sentenceDiv.style.display = 'block';
+            sentenceText.textContent = texts[index];
+            console.log(`[前端调试] 显示句子 ${index}: ${texts[index]}`);
+        } else {
+            // Index -1 (no sentence) or out of range: hide the caption box.
+            sentenceDiv.style.display = 'none';
+        }
+    }
+
    _applyEasing(t, type) {
        switch(type) {
            case 'easeOutQuad':
--- a/examples/3d/index.html
+++ b/examples/3d/index.html
@ -60,6 +60,11 @@
        <div class="status" id="status"></div>
    </div>

+    <!-- 当前播放句子显示（屏幕中央） -->
+    <div class="current-sentence" id="currentSentence" style="position: fixed; top: 50%; left: 50%; transform: translate(-50%, -50%); padding: 20px 40px; background: rgba(0,0,0,0.7); border-radius: 10px; color: white; font-size: 24px; text-align: center; display: none; z-index: 1000; max-width: 80%; pointer-events: none;">
+        <div id="sentenceText"></div>
+    </div>
+
    <div class="idle-controls">
        <h2>空闲动画控制</h2>

--- a/examples/3d/main.js
+++ b/examples/3d/main.js
@ -141,6 +141,7 @@ async function generateAnimationStream(text, apiUrl) {
    const flushBatchMs = 50;
    const minStartFrames = Math.max(1, Math.round(animator.dataFps * (streamBufferMs / 1000)));
    const frameBatchSize = Math.max(1, Math.round(animator.dataFps * (flushBatchMs / 1000)));
+    let sentenceTexts = [];  // 存储句子文本

    const flushFrames = (force = false) => {
        if (pendingFrames.length === 0) {
@ -151,9 +152,7 @@ async function generateAnimationStream(text, apiUrl) {
        }
        const framesToFlush = pendingFrames.splice(0, pendingFrames.length);
        animator.appendAnimationFrames(framesToFlush);
-        console.log(`Flushed ${framesToFlush.length} frames, total: ${animator.animationFrames.length}`);
        if (!started && animator.animationFrames.length >= minStartFrames) {
-            console.log(`Starting animation with ${animator.animationFrames.length} frames (min: ${minStartFrames})`);
            animator.playAnimation();
            started = true;
        }
@ -170,6 +169,12 @@ async function generateAnimationStream(text, apiUrl) {
            const stageMessage = message.message || 'Streaming';
            showStatus(stageMessage, 'info');
            console.log('Stream status:', message);
+            // 保存句子文本并传递给动画器
+            if (message.sentence_texts) {
+                sentenceTexts = message.sentence_texts;
+                animator.sentenceTexts = sentenceTexts;
+                console.log('[前端调试] 接收到句子列表:', sentenceTexts);
+            }
            return;
        }

--- a/services/a2f_api/pycache/a2f_service.cpython-311.pyc
+++ b/services/a2f_api/pycache/a2f_service.cpython-311.pyc
--- a/services/a2f_api/pycache/edge_tts_service.cpython-311.pyc
+++ b/services/a2f_api/pycache/edge_tts_service.cpython-311.pyc
--- a/services/a2f_api/pycache/text_to_blendshapes_service.cpython-311.pyc
+++ b/services/a2f_api/pycache/text_to_blendshapes_service.cpython-311.pyc
--- a/services/a2f_api/a2f_service.py
+++ b/services/a2f_api/a2f_service.py
@ -3,38 +3,51 @@ import sys
 import os
 from pathlib import Path
 import glob
+import tempfile
+import shutil
+from datetime import datetime

 class A2FService:
    def __init__(self, a2f_url="192.168.1.39:52000"):
        self.base_dir = Path(__file__).parent.parent.parent
-        self.output_dir = self.base_dir / "data" / "output"
        self.a2f_script = self.base_dir / "external" / "Audio2Face-3D-Samples" / "scripts" / "audio2face_3d_microservices_interaction_app" / "a2f_3d.py"
        self.config_file = self.base_dir / "external" / "Audio2Face-3D-Samples" / "scripts" / "audio2face_3d_microservices_interaction_app" / "config" / "config_james.yml"
        self.a2f_url = a2f_url
-        os.makedirs(self.output_dir, exist_ok=True)

-    def audio_to_csv(self, audio_path: str) -> str:
-        cmd = [
-            sys.executable,
-            str(self.a2f_script),
-            "run_inference",
-            audio_path,
-            str(self.config_file),
-            "--url",
-            self.a2f_url
-        ]
+    def audio_to_csv(self, audio_path: str) -> tuple[str, str]:
+        # 使用时间戳创建独立的临时工作目录
+        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
+        temp_work_dir = tempfile.mkdtemp(prefix=f"a2f_work_{timestamp}_")

-        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=str(self.output_dir))
+        try:
+            cmd = [
+                sys.executable,
+                str(self.a2f_script),
+                "run_inference",
+                audio_path,
+                str(self.config_file),
+                "--url",
+                self.a2f_url
+            ]

-        if result.returncode != 0:
-            raise RuntimeError(f"A2F inference failed: {result.stdout}")
+            # 在独立的工作目录中运行
+            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, cwd=temp_work_dir)

-        output_dirs = sorted(glob.glob(str(self.output_dir / "output_*")))
-        if not output_dirs:
-            raise RuntimeError("No output directory found")
+            if result.returncode != 0:
+                raise RuntimeError(f"A2F inference failed: {result.stdout}")

-        csv_path = os.path.join(output_dirs[-1], "animation_frames.csv")
-        if not os.path.exists(csv_path):
-            raise RuntimeError(f"CSV file not found: {csv_path}")
+            # 在工作目录中查找输出
+            output_dirs = sorted(glob.glob(os.path.join(temp_work_dir, "output_*")))
+            if not output_dirs:
+                raise RuntimeError(f"No output directory found in {temp_work_dir}")

-        return csv_path
+            csv_path = os.path.join(output_dirs[-1], "animation_frames.csv")
+            if not os.path.exists(csv_path):
+                raise RuntimeError(f"CSV file not found: {csv_path}")
+
+            # 返回CSV路径和临时目录路径（用于后续清理）
+            return csv_path, temp_work_dir
+        except Exception as e:
+            # 出错时清理临时目录
+            shutil.rmtree(temp_work_dir, ignore_errors=True)
+            raise e
--- a/services/a2f_api/api.py
+++ b/services/a2f_api/api.py
@ -22,6 +22,7 @@ class TextRequest(BaseModel):
    split_punctuations: str = None
    max_sentence_length: int = None
    first_sentence_split_size: int = None
+    tts_provider: str = 'pyttsx3'  # 'pyttsx3' 或 'edge-tts'

@app.get('/health')
 async def health():
@ -30,7 +31,10 @@ async def health():
@app.post('/text-to-blendshapes')
 async def text_to_blendshapes(request: TextRequest):
    try:
-        service = TextToBlendShapesService(lang=request.language)
+        service = TextToBlendShapesService(
+            lang=request.language,
+            tts_provider=request.tts_provider
+        )
        result = service.text_to_blend_shapes(
            request.text,
            segment=request.segment,
@ -46,7 +50,10 @@ async def text_to_blendshapes(request: TextRequest):
@app.post('/text-to-blendshapes/stream')
 async def text_to_blendshapes_stream(request: TextRequest):
    async def generate():
-        service = TextToBlendShapesService(lang=request.language)
+        service = TextToBlendShapesService(
+            lang=request.language,
+            tts_provider=request.tts_provider
+        )
        try:
            for message in service.iter_text_to_blend_shapes_stream(
                request.text,
--- a/services/a2f_api/edge_tts_service.py
+++ b/services/a2f_api/edge_tts_service.py
@ -0,0 +1,29 @@
+import os
+import asyncio
+import edge_tts
+
+class EdgeTTSService:
+    """Text-to-speech backend built on the `edge_tts` package.
+
+    Exposes a synchronous `text_to_audio(text, output_path)` method.
+    """
+
+    def __init__(self, lang='zh-CN'):
+        self.lang = lang
+        # Language code -> edge-tts voice name
+        self.voice_map = {
+            'zh-CN': 'zh-CN-XiaoxiaoNeural',  # Xiaoxiao
+            'zh-TW': 'zh-TW-HsiaoChenNeural',
+            'en-US': 'en-US-AriaNeural'
+        }
+
+    def text_to_audio(self, text: str, output_path: str) -> str:
+        """Synthesize `text` with edge-tts and write the audio to `output_path`.
+
+        NOTE(review): edge-tts delivers MP3-encoded audio by default and the
+        data is saved as received, so the file is presumably MP3 even when
+        `output_path` ends in `.wav` -- confirm the downstream consumer
+        accepts that.
+
+        :param text: text to speak
+        :param output_path: destination file; its parent directory is created
+            if it does not exist
+        :return: `output_path`
+        """
+        # dirname is '' for a bare filename, and os.makedirs('') raises
+        output_dir = os.path.dirname(output_path)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+
+        # Languages without an entry fall back to the zh-CN voice
+        voice = self.voice_map.get(self.lang, 'zh-CN-XiaoxiaoNeural')
+
+        # edge-tts is async. asyncio.run() drives it from synchronous code but
+        # refuses to start in a thread that already runs an event loop (e.g.
+        # when this is called directly from an `async def` request handler).
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            # No loop running in this thread: run the coroutine right here.
+            asyncio.run(self._async_text_to_audio(text, output_path, voice))
+        else:
+            # A loop is already running here: give the coroutine its own loop
+            # in a helper thread and block until it finishes.
+            from concurrent.futures import ThreadPoolExecutor
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                pool.submit(
+                    asyncio.run,
+                    self._async_text_to_audio(text, output_path, voice)
+                ).result()
+
+        return output_path
+
+    async def _async_text_to_audio(self, text: str, output_path: str, voice: str):
+        """Request speech for `text` with `voice` and save it to `output_path`."""
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(output_path)
--- a/services/a2f_api/text_to_blendshapes_service.py
+++ b/services/a2f_api/text_to_blendshapes_service.py
@ -6,14 +6,26 @@ import queue
 import threading
 from datetime import datetime
 from tts_service import TTSService
+from edge_tts_service import EdgeTTSService
 from a2f_service import A2FService
 from blend_shape_parser import BlendShapeParser

 class TextToBlendShapesService:
    DEFAULT_SPLIT_PUNCTUATIONS = '。！？；!?;,，'

-    def __init__(self, lang='zh-CN', a2f_url="192.168.1.39:52000"):
-        self.tts = TTSService(lang=lang)
+    def __init__(self, lang='zh-CN', a2f_url="192.168.1.39:52000", tts_provider='edge-tts'):
+        """
+        初始化服务
+        :param lang: 语言
+        :param a2f_url: A2F服务地址
+        :param tts_provider: TTS提供商 ('pyttsx3' 或 'edge-tts')
+        """
+        # 根据选择初始化TTS服务
+        if tts_provider == 'edge-tts':
+            self.tts = EdgeTTSService(lang=lang)
+        else:
+            self.tts = TTSService(lang=lang)
+
        self.a2f = A2FService(a2f_url=a2f_url)
        self.parser = BlendShapeParser()

@ -67,7 +79,18 @@ class TextToBlendShapesService:
            yield {'type': 'error', 'message': '文本为空'}
            return

-        yield {'type': 'status', 'stage': 'split', 'sentences': len(sentences), 'message': f'已拆分为 {len(sentences)} 个句子'}
+        yield {
+            'type': 'status',
+            'stage': 'split',
+            'sentences': len(sentences),
+            'sentence_texts': sentences,  # 发送句子文本列表
+            'message': f'已拆分为 {len(sentences)} 个句子'
+        }
+
+        # 打印句子列表用于调试
+        print(f"[调试] 发送给前端的句子列表:")
+        for i, s in enumerate(sentences):
+            print(f"  [{i}] {s}")

        # 使用队列来收集处理完成的句子
        result_queue = queue.Queue()
@ -126,6 +149,7 @@ class TextToBlendShapesService:
                is_continuation = self.is_continuation[next_index] if next_index < len(self.is_continuation) else False

                print(f"[主线程] 正在推送句子 {next_index} 的 {len(frames)} 帧 {'(连续)' if is_continuation else ''}")
+                print(f"[调试] 句子 {next_index} 对应文本: {sentences[next_index] if next_index < len(sentences) else 'N/A'}")

                # 如果不是连续句子，重置累计时间
                if not is_continuation and next_index > 0:
@ -135,7 +159,6 @@ class TextToBlendShapesService:
                    # 调整时间码：从累计时间开始
                    frame['timeCode'] = cumulative_time + frame['timeCode']
                    frame['sentenceIndex'] = next_index
-                    frame['isContinuation'] = is_continuation
                    total_frames += 1
                    yield {'type': 'frame', 'frame': frame}

@ -157,6 +180,7 @@ class TextToBlendShapesService:
        start_time = time.time()

        print(f"[线程 {index}] 开始处理: {sentence[:30]}...")
+        print(f"[调试] 线程 {index} 实际处理的完整文本: [{sentence}] (长度: {len(sentence)}字)")
        _, audio_path = self._prepare_output_paths(output_dir, suffix=f's{index:03d}')

        print(f"[线程 {index}] TTS 开始...")
@ -166,7 +190,7 @@ class TextToBlendShapesService:
        print(f"[线程 {index}] TTS 完成，耗时 {tts_time:.2f}秒，A2F 开始...")

        a2f_start = time.time()
-        csv_path = self.a2f.audio_to_csv(audio_path)
+        csv_path, temp_dir = self.a2f.audio_to_csv(audio_path)  # 接收临时目录路径
        a2f_time = time.time() - a2f_start
        print(f"[线程 {index}] A2F 完成，耗时 {a2f_time:.2f}秒，解析中...")

@ -174,6 +198,14 @@ class TextToBlendShapesService:
        frames = list(self.parser.iter_csv_to_blend_shapes(csv_path))
        parse_time = time.time() - parse_start

+        # 解析完成后清理临时目录
+        import shutil
+        try:
+            shutil.rmtree(temp_dir, ignore_errors=True)
+            print(f"[线程 {index}] 已清理临时目录: {temp_dir}")
+        except Exception as e:
+            print(f"[线程 {index}] 清理临时目录失败: {e}")
+
        total_time = time.time() - start_time
        print(f"[线程 {index}] 完成！生成了 {len(frames)} 帧 | 总耗时: {total_time:.2f}秒 (TTS: {tts_time:.2f}s, A2F: {a2f_time:.2f}s, 解析: {parse_time:.2f}s)")

@ -239,12 +271,15 @@ class TextToBlendShapesService:
            length = len(first)
            parts = []

-            if length <= 12:
-                # 12字以内分两部分
+            if length <= 8:
+                # 8字以下不拆分
+                parts = [first]
+            elif length <= 12:
+                # 9-12字分两部分
                mid = length // 2
                parts = [first[:mid], first[mid:]]
            else:
-                # 12字之后：前6字，再6字，剩下的
+                # 12字以上：前6字，再6字，剩下的
                parts = [first[:6], first[6:12], first[12:]]

            # 替换第一句为多个小句
--- a/工作日报_2025-12-25.md
+++ b/工作日报_2025-12-25.md
@ -0,0 +1,43 @@
+# 工作日报 - 2025年12月25日
+
+## 今日完成工作
+
+### 1. 修复句子拆分导致的播放停顿问题
+- **问题**：原系统将长句子前2-3个字单独拆分，导致播放时出现不自然的停顿
+- **解决**：移除激进拆分逻辑，实现智能拆分策略
+
+### 2. 实现可配置的智能拆分规则
+- **≤8字**：不拆分，整句处理
+- **9-12字**：拆分为2部分并发处理
+- **>12字**：拆分为3部分（6字+6字+剩余）并发处理
+- **效果**：平衡了响应速度和播放流畅性
+
+### 3. 实现流式传输功能
+- 支持动画帧数据的实时流式推送
+- 边生成边传输，降低首帧延迟
+- 使用队列机制保证帧顺序的正确性
+
+### 4. 修复时间码连续性问题
+- **问题**：拆分后的片段时间码重置，导致动画不连续
+- **解决**：重构时间码调整逻辑，连续片段保持累计时间无缝衔接
+
+### 5. 添加连续片段标记机制
+- 在每个动画帧中添加 `isContinuation` 标记（注：该字段已在 12-26 的提交中从帧数据里移除，帧中仅保留 `sentenceIndex`）
+- 为前端提供片段连续性信息，便于后续优化
+
+### 6. 优化并发处理性能
+- 使用多线程（ThreadPoolExecutor）并行生成TTS和A2F数据
+- 长句子（60字）处理速度提升约3倍
+
+### 7. 更新API接口和前端调用
+- 添加 `first_sentence_split_size` 参数控制拆分行为
+- 前端默认启用拆分优化
+
+### 8. 涉及文件
+- 后端：`services/a2f_api/text_to_blendshapes_service.py`、`api.py`
+- 前端：`examples/3d/main.js`
+
+---
+
+**日期**：2025年12月25日
+**项目**：文本转语音动画服务优化