流式传输

2025-12-25 15:36:35 +08:00
parent e56f47076c
commit 14bfdcbf51
19 changed files with 1191 additions and 65 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@ -5,7 +5,15 @@
      "Bash(python run.py:*)",
      "Bash(tree:*)",
      "Bash(ls:*)",
-      "Bash(dir:*)"
+      "Bash(dir:*)",
      "Bash(python:*)",
      "Bash(cat:*)",
      "Bash(netstat:*)",
      "Bash(findstr:*)",
      "Bash(taskkill:*)",
      "Bash(where python:*)",
      "Bash(source:*)",
      "Bash(curl:*)"
    ]
  }
 }
--- a/examples/3d/SDK_USAGE.md
+++ b/examples/3d/SDK_USAGE.md
@ -87,6 +87,15 @@ animator.playAnimation();
 **`loadAnimationFrames(frames)`**
 加载动画帧数据
 **`appendAnimationFrames(frames)`**
 追加动画帧数据（流式场景）
 **`startStreaming()`**
 开启流式模式，允许边接收边播放
 **`endStreaming()`**
 结束流式模式
 **`playAnimation()`**
 播放动画
--- a/examples/3d/blendshapeAnimator.js
+++ b/examples/3d/blendshapeAnimator.js
@ -10,6 +10,10 @@ class BlendShapeAnimator {
        this.idleAnimations = {};
        this.blendShapeScale = config.blendShapeScale || 1.0;
        this.dataFps = config.dataFps || 30;
        this.isStreaming = false;
        this.streamingComplete = true;
        this.streamingWaitStart = null;
        this.streamingStallMs = 0;
        // 空闲动画参数
        this.blinkParams = config.blinkParams || {
@ -36,6 +40,14 @@ class BlendShapeAnimator {
        this.enabledExpressions = new Set();
        this.expressionDurations = {};
        // 播放动画时禁用的 blendshape 列表（由空闲动画控制）
        this.disabledShapesInAnimation = config.disabledShapesInAnimation || [
            'eyeblinkleft', 'eyeblinkright',  // 眨眼
            'browdownleft', 'browdownright',   // 眉毛下
            'browinnerup',                      // 眉毛内上
            'browouterupleft', 'browouterupright' // 眉毛外上
        ];
        // 状态标志
        this.isBlinkEnabled = false;
        this.isEyeLookEnabled = false;
@ -63,6 +75,43 @@ class BlendShapeAnimator {
        this.animationShapeNames = this._collectAnimationShapeNames(this.animationFrames);
    }
    appendAnimationFrames(frames) {
        if (!Array.isArray(frames) || frames.length === 0) {
            return;
        }
        this.animationFrames.push(...frames);
        const newNames = this._collectAnimationShapeNames(frames);
        if (newNames.length === 0) {
            return;
        }
        const existingNames = new Set(this.animationShapeNames);
        newNames.forEach(name => {
            if (!existingNames.has(name)) {
                existingNames.add(name);
                this.animationShapeNames.push(name);
            }
        });
    }
    startStreaming() {
        console.log('Starting streaming mode');
        this.isStreaming = true;
        this.streamingComplete = false;
        this.streamingWaitStart = null;
        this.streamingStallMs = 0;
    }
    endStreaming() {
        console.log('Ending streaming mode');
        this.streamingComplete = true;
        this.isStreaming = false;
        this.streamingWaitStart = null;
        this.streamingStallMs = 0;
    }
    // 播放动画
    playAnimation() {
        if (this.animationFrames.length === 0) {
@ -75,15 +124,26 @@ class BlendShapeAnimator {
            return;
        }
-        // 停止随机表情
+        // 停止并立即重置眼球移动
        if (this.isEyeLookEnabled) {
            this._stopRandomEyeLook();
            this._immediateResetEyeLook();
        }
        // 停止并立即重置随机表情
        if (this.isExpressionEnabled && window.ExpressionLibrary) {
            window.ExpressionLibrary.randomPlayer.stop();
            this._immediateResetExpressions();
        }
        // 注意：不停止眨眼，让眨眼继续运行
        this.stopAnimation(false);
        this.isPlaying = true;
        this.currentFrameIndex = 0;
        this.animationStartTime = performance.now();
        this.streamingWaitStart = null;
        this.streamingStallMs = 0;
        this._animateFrame();
        this.onStatusChange('info', '播放中...');
@ -94,6 +154,11 @@ class BlendShapeAnimator {
        this.isPlaying = false;
        this._resetAnimationInfluences();
        // 恢复眼球移动
        if (resumeExpressions && this.isEyeLookEnabled) {
            this._startRandomEyeLook();
        }
        // 恢复随机表情
        if (resumeExpressions && this.isExpressionEnabled && window.ExpressionLibrary) {
            window.ExpressionLibrary.randomPlayer.start();
@ -130,12 +195,35 @@ class BlendShapeAnimator {
        if (!this.isPlaying) return;
        const now = performance.now();
        if (this.streamingWaitStart !== null && this.animationFrames.length > this.currentFrameIndex + 1) {
            this.streamingStallMs += now - this.streamingWaitStart;
            this.streamingWaitStart = null;
        }
        const frameDuration = 1000 / this.dataFps;
-        const elapsed = now - this.animationStartTime;
+        const elapsed = now - this.animationStartTime - this.streamingStallMs;
        const exactFrame = elapsed / frameDuration;
        const targetFrameIndex = Math.floor(exactFrame);
        if (targetFrameIndex >= this.animationFrames.length) {
            if (this.isStreaming && !this.streamingComplete) {
                if (this.streamingWaitStart === null) {
                    this.streamingWaitStart = now;
                    console.log(`Waiting for more frames... (current: ${this.animationFrames.length}, target: ${targetFrameIndex})`);
                }
                const waitTime = now - this.streamingWaitStart;
                if (waitTime > 30000) {
                    console.warn('Streaming timeout after 30s, stopping animation');
                    this.stopAnimation();
                    return;
                }
                if (waitTime > 1000 && Math.floor(waitTime / 1000) !== Math.floor((waitTime - 16) / 1000)) {
                    console.log(`Still waiting... ${Math.floor(waitTime / 1000)}s`);
                }
                requestAnimationFrame(() => this._animateFrame());
                return;
            }
            console.log(`Animation complete. Total frames: ${this.animationFrames.length}`);
            this.stopAnimation();
            return;
        }
@ -152,6 +240,11 @@ class BlendShapeAnimator {
            : Object.keys(currentBlendShapes);
        for (const key of shapeNames) {
            // 跳过禁用列表中的 blendshape，让空闲动画继续控制它们
            if (this.disabledShapesInAnimation.includes(key.toLowerCase())) {
                continue;
            }
            const currentValue = currentBlendShapes[key] || 0;
            const nextValue = nextBlendShapes[key] || 0;
            const interpolatedValue = this._lerp(currentValue, nextValue, smoothProgress);
@ -357,6 +450,38 @@ class BlendShapeAnimator {
        });
    }
    _immediateResetEyeLook() {
        const eyeLookShapes = [
            'eyelookupleft', 'eyelookupright',
            'eyelookdownleft', 'eyelookdownright',
            'eyelookinleft', 'eyelookinright',
            'eyelookoutleft', 'eyelookoutright'
        ];
        if (!this.morphTargetAdapter) return;
        eyeLookShapes.forEach(name => {
            this.morphTargetAdapter.setInfluence(name, 0);
            delete this.idleAnimations[name];
        });
    }
    _immediateResetExpressions() {
        if (!this.morphTargetAdapter || !window.ExpressionLibrary) return;
        // 获取所有表情的 blendshape 名称并立即重置
        const expressions = window.ExpressionLibrary.expressions;
        for (const exprKey in expressions) {
            const expr = expressions[exprKey];
            if (expr.shapes) {
                for (const shapeName in expr.shapes) {
                    this.morphTargetAdapter.setInfluence(shapeName, 0);
                    delete this.idleAnimations[shapeName];
                }
            }
        }
    }
    // 随机表情控制
    toggleRandomExpression(enabled) {
        if (!window.ExpressionLibrary) {
--- a/examples/3d/index.html
+++ b/examples/3d/index.html
@ -30,6 +30,13 @@
            <input type="text" id="apiUrl" value="http://localhost:5001/text-to-blendshapes">
        </div>
        <div class="input-group toggle-group">
            <label>
                <input type="checkbox" id="streamEnabled" checked>
                启用流式传输
            </label>
        </div>
        <div class="input-group">
            <label>形态键强度: <span id="scaleValue">1.0</span></label>
            <input type="range" id="scaleSlider" min="0" max="2" step="0.1" value="1.0"
--- a/examples/3d/main.js
+++ b/examples/3d/main.js
@ -60,6 +60,7 @@ async function generateAnimation() {
    const text = document.getElementById('textInput').value.trim();
    const apiUrl = document.getElementById('apiUrl').value;
    const btn = document.getElementById('generateBtn');
    const streamEnabled = document.getElementById('streamEnabled')?.checked;
    if (!text) {
        showStatus("请输入文字", "error");
@ -70,22 +71,11 @@ async function generateAnimation() {
    showStatus("生成中...", "info");
    try {
-        const response = await fetch(apiUrl, {
+        if (streamEnabled) {
-            method: 'POST',
+            await generateAnimationStream(text, apiUrl);
-            headers: { 'Content-Type': 'application/json' },
+        } else {
-            body: JSON.stringify({ text: text, language: 'zh-CN' })
+            await generateAnimationBatch(text, apiUrl);
        });
        const data = await response.json();
        if (!response.ok || !data.success) {
            throw new Error(data.error || '请求失败');
        }
        animator.loadAnimationFrames(data.frames);
        console.log("动画数据:", data.frames);
        showStatus(`动画生成成功！共 ${data.frames.length} 帧`, "success");
    } catch (err) {
        showStatus("错误: " + err.message, "error");
    } finally {
@ -93,6 +83,190 @@ async function generateAnimation() {
    }
 }
 /**
  * Request the whole animation in a single response and load it into the
  * animator.
  *
  * @param {string} text - Text to convert.
  * @param {string} apiUrl - API address; normalized to the non-streaming endpoint.
  * @returns {Promise<void>}
  * @throws {Error} When the request fails or the payload does not report success.
  */
 async function generateAnimationBatch(text, apiUrl) {
    const url = normalizeApiUrl(apiUrl, false);
    const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: text, language: 'zh-CN', first_sentence_split_size: 1 })
    });
    let data;
    try {
        data = await response.json();
    } catch (err) {
        // A failed request may carry a non-JSON body (e.g. a proxy error
        // page); report the HTTP status instead of a JSON parse error.
        if (!response.ok) {
            throw new Error(`请求失败 (${response.status})`);
        }
        throw err;
    }
    // `data` may legitimately parse to null, hence the optional chaining.
    if (!response.ok || !data?.success) {
        throw new Error(data?.error || '请求失败');
    }
    animator.loadAnimationFrames(data.frames);
    console.log("动画数据:", data.frames);
    showStatus(`动画生成成功！共 ${data.frames.length} 帧`, "success");
 }
 /**
  * Request the animation from the streaming endpoint (NDJSON, one JSON
  * message per line) and feed frames to the animator while they arrive, so
  * playback can start before the stream has finished.
  *
  * Handled message types: `frame`, `status`, `error` (aborts the stream) and
  * `end`. Lines that are not valid JSON are skipped. Falls back to
  * `generateAnimationBatch` when the response exposes no readable body.
  *
  * @param {string} text - Text to convert.
  * @param {string} apiUrl - API address; normalized to the streaming endpoint.
  * @returns {Promise<void>}
  * @throws {Error} When the request fails or the server sends an `error` message.
  */
 async function generateAnimationStream(text, apiUrl) {
    const url = normalizeApiUrl(apiUrl, true);
    const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: text, language: 'zh-CN', first_sentence_split_size: 1 })
    });
    if (!response.ok) {
        let errorMessage = `请求失败 (${response.status})`;
        try {
            const data = await response.json();
            if (data?.error) {
                errorMessage = data.error;
            }
        } catch (err) {
            // The error body is not JSON; keep the status-based message.
        }
        throw new Error(errorMessage);
    }
    if (!response.body) {
        await generateAnimationBatch(text, apiUrl);
        return;
    }
    animator.stopAnimation();
    animator.loadAnimationFrames([]);
    animator.startStreaming();
    const reader = response.body.getReader();
    const decoder = new TextDecoder();
    let buffer = '';
    let started = false;
    const pendingFrames = [];
    const streamBufferMs = 300;
    const flushBatchMs = 50;
    // Frames to buffer before playback starts / frames handed over per batch.
    const minStartFrames = Math.max(1, Math.round(animator.dataFps * (streamBufferMs / 1000)));
    const frameBatchSize = Math.max(1, Math.round(animator.dataFps * (flushBatchMs / 1000)));
    // Hand pending frames to the animator; start playback once enough arrived.
    const flushFrames = (force = false) => {
        if (pendingFrames.length === 0) {
            return;
        }
        if (!force && pendingFrames.length < frameBatchSize) {
            return;
        }
        const framesToFlush = pendingFrames.splice(0, pendingFrames.length);
        animator.appendAnimationFrames(framesToFlush);
        console.log(`Flushed ${framesToFlush.length} frames, total: ${animator.animationFrames.length}`);
        if (!started && animator.animationFrames.length >= minStartFrames) {
            console.log(`Starting animation with ${animator.animationFrames.length} frames (min: ${minStartFrames})`);
            animator.playAnimation();
            started = true;
        }
    };
    const handleMessage = (message) => {
        // Valid JSON that is not an object (null, numbers, ...) carries no type.
        if (message === null || typeof message !== 'object') {
            return;
        }
        if (message.type === 'frame') {
            pendingFrames.push(message.frame);
            flushFrames();
            return;
        }
        if (message.type === 'status') {
            const stageMessage = message.message || 'Streaming';
            showStatus(stageMessage, 'info');
            console.log('Stream status:', message);
            return;
        }
        if (message.type === 'error') {
            throw new Error(message.message || 'Streaming error');
        }
        if (message.type === 'end') {
            console.log('Stream ended, flushing remaining frames');
            flushFrames(true);
            animator.endStreaming();
            if (!started && animator.animationFrames.length > 0) {
                animator.playAnimation();
                started = true;
            }
            const totalFrames = message.frames ?? animator.animationFrames.length;
            console.log(`Total frames received: ${totalFrames}, in animator: ${animator.animationFrames.length}`);
            showStatus(`流式动画接收完成，共 ${totalFrames} 帧`, "success");
        }
    };
    // Parse and dispatch one NDJSON line. Only JSON syntax errors are ignored;
    // errors raised while handling the message (e.g. a server `error`
    // message) must propagate to the caller.
    const handleLine = (line) => {
        if (!line.trim()) {
            return;
        }
        let message;
        try {
            message = JSON.parse(line);
        } catch (err) {
            return;
        }
        handleMessage(message);
    };
    let streamError = null;
    try {
        while (true) {
            const { value, done } = await reader.read();
            if (done) {
                break;
            }
            buffer += decoder.decode(value, { stream: true });
            const lines = buffer.split('\n');
            // The last piece may be an incomplete line; keep it for the next chunk.
            buffer = lines.pop() || '';
            for (const line of lines) {
                handleLine(line);
            }
        }
        // Flush bytes the decoder still holds, then handle a final line that
        // was not terminated by a newline.
        buffer += decoder.decode();
        handleLine(buffer);
    } catch (err) {
        streamError = err;
        throw err;
    } finally {
        if (streamError) {
            // Stop the download and release the connection. A cancellation
            // failure is irrelevant here because the original error is rethrown.
            reader.cancel().catch(() => {});
        } else {
            flushFrames(true);
            if (!started && animator.animationFrames.length > 0) {
                animator.playAnimation();
            }
        }
        animator.endStreaming();
    }
 }
 /**
  * Map an API address onto the batch or the streaming endpoint.
  *
  * Trailing slashes are dropped. A recognized endpoint suffix
  * (`/text-to-blendshapes` or `/text-to-blendshapes/stream`) is replaced by
  * the requested one; any other address gets the requested suffix appended.
  * Falsy input is returned unchanged.
  *
  * @param {string} apiUrl - Address entered by the user.
  * @param {boolean} streamEnabled - Whether the streaming endpoint is wanted.
  * @returns {string} The normalized endpoint URL.
  */
 function normalizeApiUrl(apiUrl, streamEnabled) {
    if (!apiUrl) {
        return apiUrl;
    }
    const basePath = '/text-to-blendshapes';
    const streamPath = `${basePath}/stream`;
    const withoutSlashes = apiUrl.replace(/\/+$/, '');
    // Strip whichever known suffix is present to get the service root.
    let root = withoutSlashes;
    if (withoutSlashes.endsWith(streamPath)) {
        root = withoutSlashes.slice(0, -streamPath.length);
    } else if (withoutSlashes.endsWith(basePath)) {
        root = withoutSlashes.slice(0, -basePath.length);
    }
    const wantedPath = streamEnabled ? streamPath : basePath;
    return root + wantedPath;
 }
 /**
  * Page-level entry point that starts playback by delegating to the shared
  * `animator` instance.
  */
 function playAnimation() {
    animator.playAnimation();
 }
--- a/examples/3d/styles.css
+++ b/examples/3d/styles.css
@ -59,6 +59,21 @@ body {
    color: #aaa;
 }
 .toggle-group label {
    display: flex;
    align-items: center;
    gap: 8px;
    margin-bottom: 0;
    color: #ddd;
 }
 .toggle-group input[type="checkbox"] {
    width: 16px;
    height: 16px;
    margin: 0;
    cursor: pointer;
 }
 input[type="text"],
 textarea {
    width: 100%;
--- a/services/a2f_api/README.md
+++ b/services/a2f_api/README.md
@ -55,6 +55,23 @@ python api.py
 }
 ```
 ### POST /text-to-blendshapes/stream
 **说明:** 使用 NDJSON 流式返回，便于边收边播放。
 **响应:** 每行一个 JSON 对象，`type` 字段取值如下:
 - `status` - 阶段提示
 - `frame` - 单帧数据
 - `end` - 完成信息
 - `error` - 错误信息
 **示例:**
 ```json
 {"type":"status","stage":"tts","message":"Generating audio"}
 {"type":"frame","frame":{"timeCode":0.0,"blendShapes":{"JawOpen":0.1}}}
 {"type":"end","frames":900,"audio_path":"...","csv_path":"..."}
 ```
 ## 文件说明
 - `tts_service.py` - 文字转音频服务
--- a/services/a2f_api/pycache/api.cpython-311.pyc
+++ b/services/a2f_api/pycache/api.cpython-311.pyc
--- a/services/a2f_api/pycache/blend_shape_parser.cpython-311.pyc
+++ b/services/a2f_api/pycache/blend_shape_parser.cpython-311.pyc
--- a/services/a2f_api/pycache/text_to_blendshapes_service.cpython-311.pyc
+++ b/services/a2f_api/pycache/text_to_blendshapes_service.cpython-311.pyc
--- a/services/a2f_api/pycache/tts_service.cpython-311.pyc
+++ b/services/a2f_api/pycache/tts_service.cpython-311.pyc
--- a/services/a2f_api/api.py
+++ b/services/a2f_api/api.py
@ -1,34 +1,73 @@
-from flask import Flask, request, jsonify
+import json
-from flask_cors import CORS
+from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from text_to_blendshapes_service import TextToBlendShapesService
-app = Flask(__name__)
+app = FastAPI()
 CORS(app)
-@app.route('/health', methods=['GET'])
+app.add_middleware(
-def health():
+    CORSMiddleware,
-    return jsonify({'status': 'ok'})
+    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
-@app.route('/text-to-blendshapes', methods=['POST'])
+class TextRequest(BaseModel):
-def text_to_blendshapes():
+    text: str
    language: str = 'zh-CN'
    segment: bool = False
    split_punctuations: str = None
    max_sentence_length: int = None
    first_sentence_split_size: int = None
@app.get('/health')
 async def health():
    return {'status': 'ok'}
@app.post('/text-to-blendshapes')
 async def text_to_blendshapes(request: TextRequest):
    try:
-        data = request.get_json()
+        service = TextToBlendShapesService(lang=request.language)
-        if not data or 'text' not in data:
+        result = service.text_to_blend_shapes(
-            return jsonify({'success': False, 'error': 'Missing text'}), 400
+            request.text,
-
+            segment=request.segment,
-        text = data['text']
+            split_punctuations=request.split_punctuations,
-        language = data.get('language', 'zh-CN')
+            max_sentence_length=request.max_sentence_length
-
+        )
-        service = TextToBlendShapesService(lang=language)
+        return result
        result = service.text_to_blend_shapes(text)
        return jsonify(result)
    except Exception as e:
        import traceback
        traceback.print_exc()
-        return jsonify({'success': False, 'error': str(e)}), 500
+        return {'success': False, 'error': str(e)}
@app.post('/text-to-blendshapes/stream')
 async def text_to_blendshapes_stream(request: TextRequest):
    async def generate():
        service = TextToBlendShapesService(lang=request.language)
        try:
            for message in service.iter_text_to_blend_shapes_stream(
                request.text,
                split_punctuations=request.split_punctuations,
                max_sentence_length=request.max_sentence_length,
                first_sentence_split_size=request.first_sentence_split_size
            ):
                yield json.dumps(message) + "\n"
        except Exception as e:
            yield json.dumps({'type': 'error', 'message': str(e)}) + "\n"
    return StreamingResponse(
        generate(),
        media_type='application/x-ndjson',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )
 if __name__ == '__main__':
    import uvicorn
    print("Text to BlendShapes API: http://localhost:5001")
-    app.run(host='0.0.0.0', port=5001, debug=True)
+    uvicorn.run(app, host='0.0.0.0', port=5001)
--- a/services/a2f_api/blend_shape_parser.py
+++ b/services/a2f_api/blend_shape_parser.py
@ -17,9 +17,11 @@ class BlendShapeParser:
    @staticmethod
    def csv_to_blend_shapes(csv_path: str):
-        frames = []
+        return list(BlendShapeParser.iter_csv_to_blend_shapes(csv_path))
    @staticmethod
    def iter_csv_to_blend_shapes(csv_path: str):
        with open(csv_path, 'r') as f:
            reader = csv.DictReader(f)
            for row in reader:
                frame = {'timeCode': float(row['timeCode']), 'blendShapes': {}}
@ -27,5 +29,4 @@ class BlendShapeParser:
                    col_name = f'blendShapes.{key}'
                    if col_name in row:
                        frame['blendShapes'][key] = float(row[col_name])
-                frames.append(frame)
+                yield frame
        return frames
--- a/services/a2f_api/test.html
+++ b/services/a2f_api/test.html
@ -0,0 +1,282 @@
 <!DOCTYPE html>
 <html lang="zh-CN">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Text to BlendShapes 测试</title>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            padding: 20px;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
            background: white;
            border-radius: 12px;
            padding: 30px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
        }
        h1 {
            color: #333;
            margin-bottom: 10px;
            font-size: 28px;
        }
        .subtitle {
            color: #666;
            margin-bottom: 30px;
            font-size: 14px;
        }
        .input-group {
            margin-bottom: 20px;
        }
        label {
            display: block;
            margin-bottom: 8px;
            color: #555;
            font-weight: 500;
        }
        input, textarea, select {
            width: 100%;
            padding: 12px;
            border: 2px solid #e0e0e0;
            border-radius: 6px;
            font-size: 14px;
            transition: border-color 0.3s;
        }
        input:focus, textarea:focus, select:focus {
            outline: none;
            border-color: #667eea;
        }
        textarea {
            resize: vertical;
            min-height: 100px;
            font-family: inherit;
        }
        button {
            width: 100%;
            padding: 14px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            border: none;
            border-radius: 6px;
            font-size: 16px;
            font-weight: 600;
            cursor: pointer;
            transition: transform 0.2s, box-shadow 0.2s;
        }
        button:hover {
            transform: translateY(-2px);
            box-shadow: 0 10px 20px rgba(102, 126, 234, 0.3);
        }
        button:active {
            transform: translateY(0);
        }
        button:disabled {
            opacity: 0.6;
            cursor: not-allowed;
            transform: none;
        }
        .loading {
            display: none;
            text-align: center;
            margin: 20px 0;
            color: #667eea;
        }
        .loading.show {
            display: block;
        }
        .result {
            margin-top: 30px;
            padding: 20px;
            background: #f8f9fa;
            border-radius: 6px;
            display: none;
        }
        .result.show {
            display: block;
        }
        .result h3 {
            color: #333;
            margin-bottom: 15px;
        }
        .stats {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin-bottom: 20px;
        }
        .stat-card {
            background: white;
            padding: 15px;
            border-radius: 6px;
            text-align: center;
        }
        .stat-value {
            font-size: 24px;
            font-weight: bold;
            color: #667eea;
        }
        .stat-label {
            font-size: 12px;
            color: #666;
            margin-top: 5px;
        }
        .frames-preview {
            max-height: 300px;
            overflow-y: auto;
            background: white;
            padding: 15px;
            border-radius: 6px;
            font-family: monospace;
            font-size: 12px;
        }
        .error {
            background: #fee;
            color: #c33;
            padding: 15px;
            border-radius: 6px;
            margin-top: 20px;
            display: none;
        }
        .error.show {
            display: block;
        }
    </style>
 </head>
 <body>
    <div class="container">
        <h1>Text to BlendShapes</h1>
        <p class="subtitle">将文字转换为 52 个 ARKit 形态键</p>
        <div class="input-group">
            <label for="text">输入文字</label>
            <textarea id="text" placeholder="请输入要转换的文字...">你好世界，这是一个测试。</textarea>
        </div>
        <div class="input-group">
            <label for="language">语言</label>
            <select id="language">
                <option value="zh-CN">中文</option>
                <option value="en">English</option>
                <option value="ja">日本語</option>
                <option value="ko">한국어</option>
            </select>
        </div>
        <div class="input-group">
            <label for="apiUrl">API 地址</label>
            <input type="text" id="apiUrl" value="http://localhost:5001/text-to-blendshapes">
        </div>
        <button id="submitBtn" onclick="convert()">转换</button>
        <div class="loading" id="loading">
            <p>⏳ 处理中，请稍候...</p>
        </div>
        <div class="error" id="error"></div>
        <div class="result" id="result">
            <h3>转换结果</h3>
            <div class="stats">
                <div class="stat-card">
                    <div class="stat-value" id="frameCount">0</div>
                    <div class="stat-label">总帧数</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value" id="duration">0s</div>
                    <div class="stat-label">时长</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value">52</div>
                    <div class="stat-label">形态键数量</div>
                </div>
            </div>
            <h4 style="margin-bottom: 10px;">帧数据预览</h4>
            <div class="frames-preview" id="framesPreview"></div>
        </div>
    </div>
    <script>
        /**
         * Read the form, POST the text to the API and show either the
         * converted result or an error message. The submit button is disabled
         * and the loading indicator shown while the request is in flight.
         *
         * @returns {Promise<void>}
         */
        async function convert() {
            const text = document.getElementById('text').value.trim();
            const language = document.getElementById('language').value;
            const apiUrl = document.getElementById('apiUrl').value;
            if (!text) {
                showError('请输入文字');
                return;
            }
            const submitBtn = document.getElementById('submitBtn');
            const loading = document.getElementById('loading');
            const result = document.getElementById('result');
            const error = document.getElementById('error');
            submitBtn.disabled = true;
            loading.classList.add('show');
            result.classList.remove('show');
            error.classList.remove('show');
            try {
                const response = await fetch(apiUrl, {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({
                        text: text,
                        language: language
                    })
                });
                let data;
                try {
                    data = await response.json();
                } catch (parseErr) {
                    // A failed request may carry a non-JSON body; report the
                    // HTTP status instead of a JSON parse error.
                    if (!response.ok) {
                        throw new Error(`请求失败 (${response.status})`);
                    }
                    throw parseErr;
                }
                // `data` may legitimately parse to null, hence the optional chaining.
                if (!response.ok || !data?.success) {
                    throw new Error(data?.error || '请求失败');
                }
                displayResult(data);
            } catch (err) {
                showError(err.message);
            } finally {
                submitBtn.disabled = false;
                loading.classList.remove('show');
            }
        }
        /**
         * Render the conversion result: frame count, duration (time code of
         * the last frame) and a JSON preview of the first three frames.
         *
         * @param {Object} data - API payload; `data.frames` is read as the frame list.
         */
        function displayResult(data) {
            const result = document.getElementById('result');
            const frameCount = document.getElementById('frameCount');
            const duration = document.getElementById('duration');
            const framesPreview = document.getElementById('framesPreview');
            const frames = data.frames || [];
            frameCount.textContent = frames.length;
            // Always write the duration so an empty result does not keep
            // showing the value left over from a previous conversion.
            if (frames.length > 0) {
                const lastFrame = frames[frames.length - 1];
                duration.textContent = lastFrame.timeCode.toFixed(2) + 's';
            } else {
                duration.textContent = '0s';
            }
            framesPreview.textContent = JSON.stringify(frames.slice(0, 3), null, 2);
            if (frames.length > 3) {
                framesPreview.textContent += '\n\n... 共 ' + frames.length + ' 帧';
            }
            result.classList.add('show');
        }
        /**
         * Show an error message in the `#error` box.
         *
         * @param {string} message - Text shown after the error prefix.
         */
        function showError(message) {
            const errorBox = document.getElementById('error');
            const prefixed = '错误: ' + message;
            errorBox.textContent = prefixed;
            errorBox.classList.add('show');
        }
    </script>
 </body>
 </html>
--- a/services/a2f_api/text_to_blendshapes_service.py
+++ b/services/a2f_api/text_to_blendshapes_service.py
@ -1,23 +1,39 @@
 import os
 import re
 import tempfile
 import concurrent.futures
 import queue
 import threading
 from datetime import datetime
 from tts_service import TTSService
 from a2f_service import A2FService
 from blend_shape_parser import BlendShapeParser
 class TextToBlendShapesService:
    DEFAULT_SPLIT_PUNCTUATIONS = '。！？；!?;,，'
    def __init__(self, lang='zh-CN', a2f_url="192.168.1.39:52000"):
        self.tts = TTSService(lang=lang)
        self.a2f = A2FService(a2f_url=a2f_url)
        self.parser = BlendShapeParser()
-    def text_to_blend_shapes(self, text: str, output_dir: str = None):
+    def text_to_blend_shapes(
-        if output_dir is None:
+        self,
-            output_dir = tempfile.gettempdir()
+        text: str,
        output_dir: str = None,
        segment: bool = False,
        split_punctuations: str = None,
        max_sentence_length: int = None
    ):
        if segment:
            return self._text_to_blend_shapes_segmented(
                text,
                output_dir,
                split_punctuations=split_punctuations,
                max_sentence_length=max_sentence_length
            )
-        os.makedirs(output_dir, exist_ok=True)
+        output_dir, audio_path = self._prepare_output_paths(output_dir)
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
        audio_path = os.path.join(output_dir, f'tts_{timestamp}.wav')
        self.tts.text_to_audio(text, audio_path)
        csv_path = self.a2f.audio_to_csv(audio_path)
@ -29,3 +45,235 @@ class TextToBlendShapesService:
            'audio_path': audio_path,
            'csv_path': csv_path
        }
    def iter_text_to_blend_shapes_stream(
        self,
        text: str,
        output_dir: str = None,
        split_punctuations: str = None,
        max_sentence_length: int = None,
        first_sentence_split_size: int = None
    ):
        output_dir = output_dir or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)
        sentences = self.split_sentences(
            text,
            split_punctuations=split_punctuations,
            max_sentence_length=max_sentence_length,
            first_sentence_split_size=first_sentence_split_size
        )
        if not sentences:
            yield {'type': 'error', 'message': '文本为空'}
            return
        yield {'type': 'status', 'stage': 'split', 'sentences': len(sentences), 'message': f'已拆分为 {len(sentences)} 个句子'}
        # 使用队列来收集处理完成的句子
        result_queue = queue.Queue()
        def process_and_queue(index, sentence):
            """处理句子并放入队列"""
            try:
                print(f"[工作线程 {index}] 开始处理: {sentence[:30]}...")
                frames, audio_path, csv_path = self._process_sentence(sentence, output_dir, index)
                result_queue.put((index, 'success', frames, None))
                print(f"[工作线程 {index}] 完成！已生成 {len(frames)} 帧并加入队列")
            except Exception as e:
                print(f"[工作线程 {index}] 失败: {str(e)}")
                import traceback
                traceback.print_exc()
                result_queue.put((index, 'error', None, str(e)))
        # 提交所有句子到线程池并发处理（增加并发数以加速）
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            for index, sentence in enumerate(sentences):
                executor.submit(process_and_queue, index, sentence)
            # 按顺序从队列中取出结果并推送
            completed = {}
            next_index = 0
            total_frames = 0
            cumulative_time = 0.0  # 累计时间，用于连续句子
            while next_index < len(sentences):
                # 如果下一个句子还没完成，等待队列
                if next_index not in completed:
                    yield {
                        'type': 'status',
                        'stage': 'processing',
                        'sentence_index': next_index,
                        'sentences': len(sentences),
                        'message': f'正在处理 {next_index + 1}/{len(sentences)}'
                    }
                    # 从队列中获取结果
                    while next_index not in completed:
                        try:
                            index, status, frames, error = result_queue.get(timeout=1)
                            completed[index] = (status, frames, error)
                            print(f"[主线程] 收到句子 {index} 的处理结果")
                        except queue.Empty:
                            continue
                # 推送下一个句子的帧
                status, frames, error = completed[next_index]
                if status == 'error':
                    yield {'type': 'error', 'message': f'句子 {next_index} 处理失败: {error}'}
                    return
                # 如果是连续句子，调整时间码使其无缝衔接
                is_continuation = self.is_continuation[next_index] if next_index < len(self.is_continuation) else False
                print(f"[主线程] 正在推送句子 {next_index} 的 {len(frames)} 帧 {'(连续)' if is_continuation else ''}")
                # 如果不是连续句子，重置累计时间
                if not is_continuation and next_index > 0:
                    cumulative_time = 0.0
                for frame in frames:
                    # 调整时间码：从累计时间开始
                    frame['timeCode'] = cumulative_time + frame['timeCode']
                    frame['sentenceIndex'] = next_index
                    frame['isContinuation'] = is_continuation
                    total_frames += 1
                    yield {'type': 'frame', 'frame': frame}
                # 更新累计时间为当前句子的最后一帧时间
                if frames:
                    cumulative_time = frames[-1]['timeCode']
                next_index += 1
        print(f"[主线程] 流式传输完成，共 {total_frames} 帧")
        yield {
            'type': 'end',
            'frames': total_frames
        }
    def _process_sentence(self, sentence, output_dir, index):
        """处理单个句子: TTS -> A2F -> 解析"""
        import time
        start_time = time.time()
        print(f"[线程 {index}] 开始处理: {sentence[:30]}...")
        _, audio_path = self._prepare_output_paths(output_dir, suffix=f's{index:03d}')
        print(f"[线程 {index}] TTS 开始...")
        tts_start = time.time()
        self.tts.text_to_audio(sentence, audio_path)
        tts_time = time.time() - tts_start
        print(f"[线程 {index}] TTS 完成，耗时 {tts_time:.2f}秒，A2F 开始...")
        a2f_start = time.time()
        csv_path = self.a2f.audio_to_csv(audio_path)
        a2f_time = time.time() - a2f_start
        print(f"[线程 {index}] A2F 完成，耗时 {a2f_time:.2f}秒，解析中...")
        parse_start = time.time()
        frames = list(self.parser.iter_csv_to_blend_shapes(csv_path))
        parse_time = time.time() - parse_start
        total_time = time.time() - start_time
        print(f"[线程 {index}] 完成！生成了 {len(frames)} 帧 | 总耗时: {total_time:.2f}秒 (TTS: {tts_time:.2f}s, A2F: {a2f_time:.2f}s, 解析: {parse_time:.2f}s)")
        return frames, audio_path, csv_path
    def _text_to_blend_shapes_segmented(
        self,
        text: str,
        output_dir: str = None,
        split_punctuations: str = None,
        max_sentence_length: int = None
    ):
        frames = []
        audio_paths = []
        csv_paths = []
        for message in self.iter_text_to_blend_shapes_stream(
            text,
            output_dir,
            split_punctuations=split_punctuations,
            max_sentence_length=max_sentence_length
        ):
            if message.get('type') == 'frame':
                frames.append(message['frame'])
            elif message.get('type') == 'error':
                return {
                    'success': False,
                    'error': message.get('message', 'Unknown error')
                }
            elif message.get('type') == 'end':
                audio_paths = message.get('audio_paths', [])
                csv_paths = message.get('csv_paths', [])
        return {
            'success': True,
            'frames': frames,
            'audio_paths': audio_paths,
            'csv_paths': csv_paths
        }
    def split_sentences(self, text: str, split_punctuations: str = None, max_sentence_length: int = None, first_sentence_split_size: int = None):
        """拆分句子，并对第一句进行特殊处理以加速首帧"""
        if not text:
            return []
        normalized = re.sub(r'[\r\n]+', '。', text.strip())
        punctuations = split_punctuations or self.DEFAULT_SPLIT_PUNCTUATIONS
        if punctuations:
            escaped = re.escape(punctuations)
            split_re = re.compile(rf'(?<=[{escaped}])')
            chunks = split_re.split(normalized)
        else:
            chunks = [normalized]
        sentences = [chunk.strip() for chunk in chunks if chunk.strip()]
        # 记录哪些句子是拆分的（需要连续播放）
        self.is_continuation = [False] * len(sentences)
        # 可选：拆分第一句以加速首帧（并发处理）
        if first_sentence_split_size and sentences:
            first = sentences[0]
            length = len(first)
            parts = []
            if length <= 12:
                # 12字以内分两部分
                mid = length // 2
                parts = [first[:mid], first[mid:]]
            else:
                # 12字之后：前6字，再6字，剩下的
                parts = [first[:6], first[6:12], first[12:]]
            # 替换第一句为多个小句
            sentences = parts + sentences[1:]
            # 标记后续部分为连续播放
            self.is_continuation = [False] + [True] * (len(parts) - 1) + [False] * (len(sentences) - len(parts))
            print(f"[拆分优化] 第一句({length}字)拆分为{len(parts)}部分: {[len(p) for p in parts]} - 连续播放")
        if not max_sentence_length or max_sentence_length <= 0:
            return sentences
        limited = []
        for sentence in sentences:
            if len(sentence) <= max_sentence_length:
                limited.append(sentence)
                continue
            start = 0
            while start < len(sentence):
                limited.append(sentence[start:start + max_sentence_length])
                start += max_sentence_length
        return limited
    def _prepare_output_paths(self, output_dir: str = None, suffix: str = None):
        if output_dir is None:
            output_dir = tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
        suffix_part = f'_{suffix}' if suffix else ''
        audio_path = os.path.join(output_dir, f'tts_{timestamp}{suffix_part}.wav')
        return output_dir, audio_path
--- a/services/a2f_api/tts_service.py
+++ b/services/a2f_api/tts_service.py
@ -1,20 +1,35 @@
 import os
 import threading
 import pyttsx3
 class TTSService:
    # Text-to-speech wrapper around pyttsx3.
    # Class-level lock: text_to_audio holds it while an engine is created and run.
    _lock = threading.Lock()
    def __init__(self, lang='zh-CN'):
        self.lang = lang
        # NOTE(review): the added ('+') version of text_to_audio below creates its
        # own engine per call, so this instance-level engine looks unused there -
        # confirm before removing.
        self.engine = pyttsx3.init()
        if lang == 'zh-CN':
            # Pick the first voice whose name mentions "chinese" or whose id contains "zh".
            voices = self.engine.getProperty('voices')
            for voice in voices:
                if 'chinese' in voice.name.lower() or 'zh' in voice.id.lower():
                    self.engine.setProperty('voice', voice.id)
                    break
    def text_to_audio(self, text: str, output_path: str) -> str:
        """Convert text to a WAV audio file (using pyttsx3).

        Creates the parent directory of ``output_path`` and returns
        ``output_path``.
        """
        # NOTE(review): a bare file name has an empty dirname, which os.makedirs
        # rejects - confirm callers always pass a path with a directory part.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
-        self.engine.save_to_file(text, output_path)
+
-        self.engine.runAndWait()
+        with self._lock:
-        return output_path
+            engine = pyttsx3.init()
            try:
                # Select a Chinese voice when one is available
                voices = engine.getProperty('voices')
                for voice in voices:
                    if 'chinese' in voice.name.lower() or 'zh' in voice.id.lower():
                        engine.setProperty('voice', voice.id)
                        break
                # Set the speech rate
                engine.setProperty('rate', 150)
                # Save to the output file (expected to be WAV)
                engine.save_to_file(text, output_path)
                engine.runAndWait()
                return output_path
            finally:
                engine.stop()
                del engine
--- a/test_tts.py
+++ b/test_tts.py
@ -0,0 +1,13 @@
 import requests
 import json
 # Manual smoke test: POST a short text to the locally running
 # text-to-blendshapes endpoint and print the status code and response body.
 url = "http://localhost:5001/text-to-blendshapes"
 data = {
    "text": "你好",
    "language": "zh-CN"
 }
 print("发送测试请求...")
 # Without a timeout, requests waits forever if the service hangs.
 response = requests.post(url, json=data, timeout=60)
 print(f"状态码: {response.status_code}")
 # Error responses may not be JSON; fall back to the raw text instead of crashing.
 try:
     body = response.json()
 except ValueError:
     body = response.text
 print(f"响应: {body}")
--- a/需求.md
+++ b/需求.md
@ -0,0 +1,176 @@
 我现在已经跑通了 a2f，能够将音频转化成 csv 格式文件。我想让你帮我实现一个项目：用 Python 实现以下转换函数：文字转音频文件 / 音频文件转 csv 文件 / csv 文件转 52 个形态键；最后再暴露出一个文字输入的接口，输出 52 个形态键的数据。
 python_services 项目你可以作为参考。
 # Babylon.js + A2F 低延迟实时嘴型方案设计文档
 ## 1. 文档目的
 本文档用于指导在 **Babylon.js（Web）** 环境下，基于 **Audio2Face（A2F）** 实现“尽可能低延迟”的数字人嘴型驱动方案。目标并非严格意义上的零延迟实时，而是在 Web 约束下实现 **准实时（400–600ms 首帧延迟）** 且稳定可上线的工程方案。
 ---
 ## 2. 设计约束与前提
 ### 2.1 技术约束
 * A2F 本身为 **非严格流式模型**，需要一定音频前瞻（lookahead）
 * Babylon.js MorphTarget 为 **CPU 驱动 + GPU 顶点更新**，性能敏感
 ## 3. 总体方案概述
 ### 3.1 核心思想
 * **文本按句拆分**，缩短首帧等待时间
 * **句级流水线处理**，而非整段阻塞
 * **音频与嘴型数据流式推送**
 * 前端仅负责 **插值播放**，不做重计算
 ### 3.2 总体架构
 ```
 文本输入
  ↓
 句子拆分（强停顿标点）
  ↓
 句子队列（Pipeline）
  ↓
 ┌──────────────┐
 │ 流式 TTS     │
 └──────────────┘
  ↓ PCM chunk
 ┌──────────────┐
 │ A2F（句级）  │
 └──────────────┘
  ↓ BlendShape Frames
 ┌──────────────┐
 │ 二进制传输   │
 └──────────────┘
  ↓
 Babylon.js 插值播放
 ```
 ---
 ## 4. 文本拆句策略
 ### 4.1 拆分规则
 * 仅按 **强停顿标点** 拆分：
  * `。` `！` `？` `；`
 ### 4.3 句尾处理
 * 每句音频结尾 **补 150–300ms 静音**
 * 用于嘴型自然回到 neutral
 ---
 ## 5. 后端处理流程
 ### 5.1 流水线调度
 * 同时最多处理 **2–3 句**
 * 始终保证：
  * 当前句播放中
  * 下一句已 ready
 ### 5.2 TTS 要求
 * 必须支持 **流式 / chunk 输出**
 * chunk 大小建议：100–200ms PCM
 ### 5.3 A2F 调用策略
 * 不等待整段文本
 * 以 **句为最小单元**调用
 ---
 ## 6. 数据格式设计（替代 CSV）
 ### 6.1 为什么不用 CSV
 * 文本解析慢
 * 数据冗余大
 * 不支持流式 append
 ### 6.2 推荐二进制结构
 ```
 Frame {
  uint16 timestamp_ms;
  uint8  shape_count;
  uint8  shape_indices[shape_count];
  int8   shape_values[shape_count]; // -127 ~ 127
 }
 ```
 ### 6.3 优点
 * 数据量减少 60–80%
 * WebSocket 直传
 * JS 解析成本极低
 ---
 ## 7. 前端（Babylon.js）播放方案
 ### 7.1 核心原则
 * **不用 onBeforeRender 逐帧 setInfluence**
 * 使用 **Animation / AnimationGroup**
 * 前端只负责：
  * buffer
  * 时间对齐
  * 插值
 ### 7.2 帧率与形态键
 | 项目           | 建议        |
 | ------------ | --------- |
 | 嘴型帧率         | 15–20 fps |
 | 形态键          | 20–30 个   |
 | Morph Normal | 关闭        |
 ### 7.3 句间过渡
 * 句尾：morph → neutral（100ms lerp）
 * 句首：neutral → first frame（100ms fade in）
 ---
 ## 8. 延迟评估
 | 环节        | 典型延迟       |
 | --------- | ---------- |
 | 流式 TTS    | 100–200 ms |
 | A2F 计算    | 200–300 ms |
 | 网络        | 20–50 ms   |
 | 前端 buffer | ~100 ms    |
 **总首帧延迟：≈ 400–600 ms**
 ---
 ## 9. 风险与边界
 * A2F 不适合 <200ms 的强实时场景
 * 高并发时需限流（TTS / A2F GPU）
 * 超长文本必须强制拆句
 ---
 ## 10. 结论
 * **拆句 + 流水线** 是 Web + A2F 的最优解
 * CSV 必须淘汰，二进制流是必选项
 * 在 Babylon.js 中可实现稳定、可上线的准实时数字人嘴型系统
 ---
 正常的流式传输逻辑是：后端将文字拆分成句，对每一句先调用 TTS 生成音频，再调用 A2F 生成 blendshape 数据，最后将 blendshape 数据发送给前端。后端要有并发处理能力，能够同时处理多句；优先将当前句的 blendshape 数据发送给前端，等下一句的 blendshape 数据 ready 后再发送。Flask 后端可以使用异步处理（例如 Flask 2.x 的 async 视图，或改用 FastAPI 并利用其 asyncio 支持）。前端可以使用 WebSocket 接收 blendshape 数据并实时播放。
--- a/需求.txt
+++ b/需求.txt
@ -1,3 +0,0 @@
 我现在已经跑通了a2f，能够将音频转化成csv格式文件，我想让你帮我实现一个项目：用py写实现两个函数，文字转音频文件/音
 频文件转csv文件/csv文件转52个形态键/ 最后在暴露出一个文字输入的接口，输出52个形态键的数据
 python_services项目你可以作为参考，