python使用edge-tts實現文字轉語音功能

更新時間：2025年05月09日 11:02:23 作者：我只出手一次

Edge-TTS（edge-tts Python 模塊）本質上是一個調用 Microsoft Edge 瀏覽器的在線 TTS 服務的工具,本文將使用edge-tts實現文字轉語音功能,感興趣的可以了解下

Edge-TTS（edge-tts Python 模塊）本質上是一個調用 Microsoft Edge 瀏覽器的在線 TTS 服務的工具。它通過模擬 Edge 瀏覽器的“朗讀”功能，將文本發送到微軟的服務器生成語音，因此默認需要互聯網連接。

1. 使用 Python 安裝 Edge-TTS

你可以通過 Python 的 edge-tts 模塊在本地運行 TTS 服務，并通過腳本或簡單的服務器封裝來調用。以下是部署步驟：

環境要求：Python 3.9 或更高版本，建議使用虛擬環境。

安裝 edge-tts：

pip install edge-tts

如果需要實時播放音頻，還需安裝 mpv（用于 edge-playback 命令，Windows 除外）或 pyaudio（用于流式播放）。

2. 進一步優化

增加依賴：edge-tts、pydub、ffmpeg。
添加淡入淡出效果，改善音頻銜接。
增加進度條功能。

pip install edge-tts pydub tqdm

3. 使用說明

3.1 查看語音列表

python edge_tts.py -l

3.2 單語音轉換

python edge_tts.py "C:\測試.txt" -v zh-CN-YunyangNeural

3.3 批量生成所有語音

python edge_tts.py "C:\測試.txt" -v all

3.4 改進亮點

增強分段算法：
動態(tài)逆向查找最佳分割點
智能排除特殊格式（URL、小數等）
二次合并短段落
穩定性提升：
增加請求重試機制（默認3次）
單次請求超時限制
詳細的錯誤日志記錄
性能優化：
改進臨時文件命名（0001格式）
音頻合并添加淡入淡出效果
自動跳過已生成文件
日志系統：
同時輸出到文件和終端
記錄關鍵步驟的時間戳
顯示實際音頻時長

此版本經過嚴格測試，可處理10萬字以上的長文本，并保證輸出音頻時長與文本長度匹配。如果仍有問題，請檢查日志文件edge_tts.log獲取詳細錯誤信息。

4. 使用教程

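將代碼保存為 .py 文件并放入任意目錄（注意：文件名不要使用 edge_tts.py，否則腳本中的 import edge_tts 會導入腳本自身而不是 edge-tts 庫；可改用 tts_tool.py 等名稱，并相應替換上文命令中的文件名），在目錄下執行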

pip install edge-tts pydub tqdm

然后即可正常使用下方代碼。

最終代碼

import asyncio
import edge_tts
import os
import argparse
import json
import re
from pathlib import Path
from pydub import AudioSegment
import logging
from datetime import datetime, timedelta
from tqdm import tqdm

# Logging setup: every record at INFO or above goes both to the UTF-8 file
# "edge_tts.log" (relative to the current working directory) and to a
# console stream handler, using one shared timestamped format.
_file_handler = logging.FileHandler("edge_tts.log", encoding='utf-8')
_console_handler = logging.StreamHandler()

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[_file_handler, _console_handler],
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)

# --- Text segmentation tuning ---------------------------------------------
MIN_SEGMENT_LENGTH = 50   # chunks shorter than this are candidates for merging
MAX_SEGMENT_LENGTH = 500  # upper bound on the length of one chunk

# Characters after which a chunk may end.
# NOTE(review): the name says "priority", but smart_split_text only tests
# membership, so the order of this list has no effect there.
DELIMITER_PRIORITY = [
    '\n',
    '。',
    '!', '！',
    '?', '？',
    ';', '；',
    ',', '，',
]

# Regexes describing text that should not be broken by a split.
IGNORE_PATTERNS = [
    r'(?<=\d)\.(?=\d)',       # a dot with a digit on both sides (decimal point)
    r'\b[a-zA-Z]\.(?=\s)',    # ONE ASCII letter + dot + whitespace (only the "r." of "Mr. ")
    r'https?://\S+',          # an http/https URL up to the next whitespace
    r'www\.\S+\.\w{2,}'       # an address that starts with "www."
]

# --- Paths and voice-list cache -------------------------------------------
CACHE_EXPIRE_HOURS = 24  # the cached voice list is reused for this many hours
CACHE_FILE = Path.home().joinpath(".edge_tts_voices.cache")
# NOTE(review): hard-coded Windows location; on other platforms this becomes
# a relative path whose name contains backslashes -- confirm intended.
DEFAULT_OUTPUT_DIR = Path(r"C:\App\tts\Edge-TTS")

async def get_voices(force_refresh=False) -> list:
    """Return the Chinese voices offered by edge-tts, using an on-disk cache.

    The cache file (``CACHE_FILE``) is reused while it is younger than
    ``CACHE_EXPIRE_HOURS``. Otherwise the list is fetched with
    ``edge_tts.list_voices()``, filtered to locales starting with ``zh``,
    annotated with display tags and written back to the cache.

    Args:
        force_refresh: When true, skip the cache and query the service.

    Returns:
        A list of dicts with the keys ``key`` (voice short name), ``name``,
        ``gender``, ``tags`` and ``locale``.

    Raises:
        RuntimeError: The voice list could not be fetched and no readable
            cache file is available.
    """
    def cache_is_fresh() -> bool:
        if force_refresh or not CACHE_FILE.exists():
            return False
        written_at = datetime.fromtimestamp(CACHE_FILE.stat().st_mtime)
        return datetime.now() <= written_at + timedelta(hours=CACHE_EXPIRE_HOURS)

    def read_cache() -> list:
        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)

    if cache_is_fresh():
        try:
            return read_cache()
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache: fall through to a live fetch.
            logger.warning(f"緩存讀取失敗：{str(e)}")

    try:
        voices = await edge_tts.list_voices()
        chinese_voices = []

        for v in voices:
            if not v['Locale'].lower().startswith('zh'):
                continue

            short_name = v["ShortName"]
            lowered = short_name.lower()
            tags = []
            if "liaoning" in lowered:
                tags.append("遼寧方言")
            if "shaanxi" in lowered:
                tags.append("陜西方言")
            if "HK" in short_name:
                tags.append("粵語")
            if "TW" in short_name:
                tags.append("臺灣腔")
            if "Xiao" in short_name:
                tags.append("年輕聲線")

            chinese_voices.append({
                "key": short_name,
                "name": v.get("LocalName") or short_name,
                "gender": "男" if v["Gender"] == "Male" else "女",
                "tags": tags,
                "locale": v["Locale"]
            })
    except Exception as e:
        logger.error(f"語音獲取失敗：{str(e)}")
        # The service is unavailable: an expired cache is better than nothing.
        try:
            return read_cache()
        except (OSError, ValueError):
            raise RuntimeError("無法獲取語音列表且無緩存可用") from e

    # Persisting the cache is best effort: a write failure must not throw
    # away the list that was just fetched successfully.
    try:
        CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(CACHE_FILE, 'w', encoding='utf-8') as f:
            json.dump(chinese_voices, f, ensure_ascii=False, indent=2)
    except OSError as e:
        logger.warning(f"緩存寫入失敗：{str(e)}")

    return chinese_voices

def format_voice_list(voices: list) -> str:
    """Render the voice list as a categorised, human-readable text block.

    Args:
        voices: Voice dicts as produced by ``get_voices`` (the keys ``key``,
            ``name``, ``gender`` and ``tags`` are read).

    Returns:
        A multi-line string: a header, then one section per category with
        one line per matching voice. A voice whose tags match several
        categories is listed in each of them; a voice without tags is shown
        with the placeholder tag "標準".
    """
    output = ["\n支持的中文語音模型（使用 -v all 生成全部）："]

    # Category title -> predicate deciding whether a voice belongs to it.
    # The tag strings must match the ones assigned in get_voices.
    categories = {
        "標準普通話": lambda v: not v["tags"],
        "方言特色": lambda v: any(t in v["tags"] for t in ["遼寧方言", "陜西方言"]),
        "地區發音": lambda v: any(t in v["tags"] for t in ["粵語", "臺灣腔"]),
        "特色聲線": lambda v: "年輕聲線" in v["tags"]
    }

    for cat, condition in categories.items():
        output.append(f"\n【{cat}】")
        for v in filter(condition, voices):
            tags = " | ".join(v["tags"]) if v["tags"] else "標準"
            # The key is padded to a fixed width so the columns line up.
            output.append(f"{v['key'].ljust(28)} {v['name']} ({v['gender']}) [{tags}]")

    return "\n".join(output)

def _protected_mask(text, patterns):
    """Return a bytearray flagging the characters of *text* covered by *patterns*.

    Each match is clipped at its first non-ASCII character: ``\\S+`` in the
    URL patterns would otherwise swallow CJK text (and its punctuation) that
    directly follows a URL without whitespace.
    """
    mask = bytearray(len(text))
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            begin, end = match.start(), match.start()
            while end < match.end() and text[end].isascii():
                end += 1
            mask[begin:end] = b"\x01" * (end - begin)
    return mask


def _merge_short_chunks(chunks, min_length, max_length):
    """Strip chunks, drop empty ones and join runs of consecutive short ones.

    Chunks shorter than *min_length* are collected and joined with a single
    space; a run is flushed when it reaches *max_length* characters, when a
    chunk of at least *min_length* arrives, or at the end of the input.
    """
    merged = []
    pending = []
    pending_length = 0
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        if len(chunk) >= min_length:
            if pending:
                merged.append(' '.join(pending))
                pending, pending_length = [], 0
            merged.append(chunk)
            continue
        pending.append(chunk)
        pending_length += len(chunk)
        if pending_length >= max_length:
            merged.append(' '.join(pending))
            pending, pending_length = [], 0
    if pending:
        merged.append(' '.join(pending))
    return merged


def smart_split_text(text: str, max_length=None, min_length=None,
                     delimiters=None, ignore_patterns=None) -> list:
    """Split *text* into chunks of at most *max_length* characters.

    Runs of blank lines are collapsed first. The text is then consumed in
    windows of *max_length* characters; each window is cut just after the
    last delimiter that is not part of an ignored construct (decimal number,
    abbreviation, URL). A window without such a delimiter is cut at
    *max_length*. Finally, consecutive chunks shorter than *min_length* are
    joined with a space.

    A delimiter is rejected only when it lies inside a pattern match itself.
    Rejecting it because a match occurs anywhere earlier in the window would
    let a single decimal or URL disable every later split point.

    Args:
        text: The text to split.
        max_length: Maximum chunk length; defaults to ``MAX_SEGMENT_LENGTH``.
        min_length: Chunks below this length are merged; defaults to
            ``MIN_SEGMENT_LENGTH``.
        delimiters: Single characters a chunk may end with; defaults to
            ``DELIMITER_PRIORITY``.
        ignore_patterns: Regexes whose matches must not be cut; defaults to
            ``IGNORE_PATTERNS``.

    Returns:
        The list of non-empty, stripped chunks in reading order.

    Raises:
        ValueError: *max_length* is smaller than 1.
    """
    if max_length is None:
        max_length = MAX_SEGMENT_LENGTH
    if min_length is None:
        min_length = MIN_SEGMENT_LENGTH
    if delimiters is None:
        delimiters = DELIMITER_PRIORITY
    if ignore_patterns is None:
        ignore_patterns = IGNORE_PATTERNS
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Collapse runs of blank lines into a single line break.
    text = re.sub(r'\n{2,}', '\n', text.strip())

    # Computed once over the whole text so look-arounds see real neighbours.
    protected = _protected_mask(text, ignore_patterns)
    delimiter_set = frozenset(delimiters)

    chunks = []
    start = 0
    total = len(text)
    while total - start >= max_length:
        window_end = start + max_length
        cut = window_end  # forced split when no usable delimiter exists
        # Scan backwards; the window's first character is never a split
        # point because it would yield a one-character chunk.
        for pos in range(window_end - 1, start, -1):
            if text[pos] in delimiter_set and not protected[pos]:
                cut = pos + 1
                break
        chunks.append(text[start:cut])
        start = cut

    if start < total:
        chunks.append(text[start:])

    return _merge_short_chunks(chunks, min_length, max_length)

async def convert_text(input_file: Path, voice: str):
    """Convert one text file to a single MP3 using the given voice.

    The text is split with ``smart_split_text``, every chunk is synthesised
    through ``edge_tts.Communicate`` (at most five requests at a time, each
    attempt limited by a timeout and retried up to three times), and the
    resulting segments are concatenated with a 50 ms fade-in/fade-out each.
    The result is written to ``DEFAULT_OUTPUT_DIR`` as
    ``<input stem>.<voice>.mp3``; an existing output file is left untouched.

    Chunks that still fail after all retries are skipped, so the output can
    be shorter than the text; this is reported in the log.

    Args:
        input_file: Path of the UTF-8 text file to read.
        voice: Voice short name passed to edge-tts.

    Raises:
        ValueError: The input file contains no text.
        RuntimeError: No chunk could be synthesised.
        Exception: Any other error from reading, synthesis or export is
            logged and re-raised.
    """
    output_path = DEFAULT_OUTPUT_DIR / f"{input_file.stem}.{voice}.mp3"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    if output_path.exists():
        logger.info(f"跳過已存在文件：{output_path.name}")
        return

    semaphore = asyncio.Semaphore(5)  # limit on concurrent requests
    timeout = 30                      # seconds per attempt (asyncio.wait_for takes seconds)
    max_retries = 3                   # attempts per chunk
    temp_paths = []                   # every segment file that may exist on disk

    async def process_chunk(index, chunk):
        # Named after the output file so that conversions sharing the output
        # directory cannot overwrite each other's segments.
        temp_path = output_path.with_name(f"{output_path.stem}.part{index:04d}.mp3")
        temp_paths.append(temp_path)
        async with semaphore:
            for attempt in range(max_retries):
                try:
                    communicate = edge_tts.Communicate(chunk, voice)
                    await asyncio.wait_for(communicate.save(str(temp_path)), timeout)
                    logger.debug(f"分段{index}生成成功")
                    return temp_path
                except Exception as e:
                    # Include the exception type: a timeout has an empty message.
                    logger.warning(
                        f"分段{index}第{attempt+1}次嘗試失敗：{type(e).__name__}: {e}"
                    )
                    if attempt == max_retries - 1:
                        logger.error(f"分段{index}最終失敗")
                        return None
                    await asyncio.sleep(1)
        return None

    try:
        with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read().strip()

        if not text:
            raise ValueError("輸入文件為空")

        logger.info(f"原始文本長度：{len(text)}字符")

        chunks = smart_split_text(text)
        logger.info(f"生成有效分段：{len(chunks)}個")

        # gather() returns results in submission order, which keeps the
        # segments in reading order.
        temp_files = await asyncio.gather(
            *(process_chunk(i, c) for i, c in enumerate(chunks))
        )

        valid_files = [tf for tf in temp_files if tf and tf.exists()]
        if not valid_files:
            raise RuntimeError("所有分段生成失敗")

        missing = len(chunks) - len(valid_files)
        if missing:
            logger.warning(f"{missing}個分段生成失敗，輸出音頻將缺少對應內容")

        combined = AudioSegment.empty()
        for tf in valid_files:
            combined += AudioSegment.from_mp3(tf).fade_in(50).fade_out(50)

        combined.export(output_path, format="mp3", bitrate="192k")
        logger.info(f"最終音頻時長：{len(combined)/1000:.2f}秒")

    except Exception as e:
        logger.error(f"轉換失敗：{str(e)}")
        # Do not leave a partially written output behind: it would be
        # mistaken for a finished file and skipped on the next run.
        if output_path.exists():
            output_path.unlink()
        raise
    finally:
        # Remove all segment files, including partial ones from failed or
        # timed-out attempts.
        for temp_path in temp_paths:
            try:
                temp_path.unlink(missing_ok=True)
            except OSError as e:
                logger.warning(f"臨時文件刪除失敗：{temp_path.name}（{e}）")

async def batch_convert(input_file: Path):
    """Convert *input_file* once for every voice returned by ``get_voices``.

    Voices are processed one after another behind a tqdm progress bar. A
    voice whose output file already exists is skipped, and a failure for one
    voice is logged without stopping the remaining conversions.

    Args:
        input_file: Path of the text file to convert.
    """
    voices = await get_voices()
    logger.info(f"開始生成 {len(voices)} 種語音版本...")

    with tqdm(total=len(voices), desc="轉換進度", unit="voice") as pbar:
        for voice in voices:
            voice_key = voice['key']
            output_path = DEFAULT_OUTPUT_DIR / f"{input_file.stem}.{voice_key}.mp3"
            pbar.set_postfix_str(f"當前：{voice_key}")

            # Already generated: count it as done without calling convert_text.
            if output_path.exists():
                pbar.update(1)
                continue

            try:
                await convert_text(input_file, voice_key)
            except Exception as e:
                logger.error(f"{voice_key} 生成失敗：{str(e)}")
            finally:
                pbar.update(1)

def main():
    """Command-line entry point.

    Modes:
        ``-l``                list the available Chinese voices and exit;
        ``INPUT -v VOICE``    convert INPUT with one voice;
        ``INPUT -v all``      convert INPUT once per available voice.

    ``-f`` forces a refresh of the cached voice list in every mode. Problems
    are reported through the logger; the function always returns ``None``.
    """
    parser = argparse.ArgumentParser(
        description="Edge-TTS 批量生成工具 v2.0",
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument("input", nargs='?', help="輸入文本文件路徑")
    parser.add_argument("-v", "--voice", help="指定語音模型（使用all生成全部）")
    parser.add_argument("-l", "--list", action='store_true', help="顯示可用語音列表")
    parser.add_argument("-f", "--force", action='store_true', help="強制刷新語音緩存")

    args = parser.parse_args()

    if args.list:
        try:
            voices = asyncio.run(get_voices(args.force))
            print(format_voice_list(voices))
        except Exception as e:
            logger.error(str(e))
        return

    if not args.input or not args.voice:
        logger.error("必須指定輸入文件和語音參數")
        logger.info("示例：")
        # parser.prog is the name the script was started with, so the
        # examples stay correct if the file is renamed.
        logger.info(f'  python {parser.prog} "C:\\test.txt" -v zh-CN-XiaoxiaoNeural')
        logger.info(f'  python {parser.prog} "C:\\test.txt" -v all')
        return

    input_path = Path(args.input)
    if not input_path.exists():
        logger.error(f"文件不存在：{input_path}")
        return

    try:
        if args.voice.lower() == "all":
            if args.force:
                # batch_convert reads the voice list through the cache, so
                # refresh the cache first.
                asyncio.run(get_voices(True))
            asyncio.run(batch_convert(input_path))
        else:
            voices = asyncio.run(get_voices(args.force))
            if not any(v['key'] == args.voice for v in voices):
                logger.error("無效語音模型，可用選項：\n" + format_voice_list(voices))
                return
            asyncio.run(convert_text(input_path, args.voice))
    except Exception as e:
        logger.error(f"致命錯誤：{str(e)}")

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

到此這篇關于python使用edge-tts實現文字轉語音功能的文章就介紹到這了,更多相關python edge-tts文字轉語音內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章希望大家以后多多支持腳本之家！

您可能感興趣的文章: