feat 添加风格支持及首页

2024-10-18 21:06:38 +08:00
parent d7280f7fbb
commit 62883ccb99
5 changed files with 194 additions and 35 deletions
--- a/handlers/handlers.go
+++ b/handlers/handlers.go
@@ -50,7 +50,7 @@ func SynthesizeVoice(c *gin.Context) {
 	pitch := c.DefaultQuery("p", "0")
 	outputFormat := c.DefaultQuery("o", "audio-24khz-48kbitrate-mono-mp3")
-	voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat)
+	voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat, c.Query("s"))
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -65,12 +65,25 @@ func Index(c *gin.Context) {
 	})
 }
 // ApiDoc serves the API documentation page: it renders the "api-doc.html"
 // template with a title value and replies with HTTP 200.
 func ApiDoc(c *gin.Context) {
 	pageData := gin.H{"title": "TTS"}
 	c.HTML(http.StatusOK, "api-doc.html", pageData)
 }
 // SynthesizeVoiceRequest is a JSON request body for speech synthesis.
 // Its fields line up with the arguments SynthesizeVoicePost passes to
 // utils.GetVoice, and the short JSON keys mirror the query parameter names
 // listed in the API documentation page (t, v, r, p, o) plus s for style.
 type SynthesizeVoiceRequest struct {
 	Text         string `json:"t"` // text to synthesize
 	VoiceName    string `json:"v"` // voice name; utils.GetVoice falls back to a default when empty
 	Rate         string `json:"r"` // speaking rate; emitted as a percentage in the SSML prosody element
 	Pitch        string `json:"p"` // pitch; emitted as a percentage in the SSML prosody element
 	OutputFormat string `json:"o"` // audio output format; utils.GetVoice falls back to a default when empty
 	Style        string `json:"s"` // speaking style; utils.GetVoice falls back to a default when empty
 }
 // SynthesizeVoiceOpenAIRequest is the JSON body decoded by
 // SynthesizeVoiceOpenAI. Its keys (model, input, voice) presumably follow
 // the OpenAI speech request format — confirm against that API.
 type SynthesizeVoiceOpenAIRequest struct {
 	Model string `json:"model"` // decoded but not read by SynthesizeVoiceOpenAI
 	Input string `json:"input"` // text to synthesize
 	Voice string `json:"voice"` // voice name handed to utils.GetVoice
 }
 func SynthesizeVoicePost(c *gin.Context) {
@@ -80,7 +93,7 @@ func SynthesizeVoicePost(c *gin.Context) {
 		return
 	}
-	voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat)
+	voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat, request.Style)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -88,3 +101,19 @@ func SynthesizeVoicePost(c *gin.Context) {
 	c.Data(http.StatusOK, "audio/mpeg", voice)
 }
 // SynthesizeVoiceOpenAI synthesizes speech from a JSON body shaped like
 // SynthesizeVoiceOpenAIRequest. The text and voice come from the body; rate,
 // pitch, output format and style are read from the optional query parameters
 // r, p, o and s.
 //
 // Responses:
 //   - 200 with the audio bytes as "audio/mpeg" on success
 //   - 400 with a JSON error when the body cannot be decoded
 //   - 500 with a JSON error when utils.GetVoice fails
 //
 // NOTE(review): the router registers this handler on GET although it reads a
 // JSON request body; clients of the OpenAI speech API normally POST — confirm
 // the intended method.
 func SynthesizeVoiceOpenAI(c *gin.Context) {
 	var request SynthesizeVoiceOpenAIRequest
 	// ShouldBindJSON only reports the error. BindJSON would abort and write
 	// the 400 header itself before the JSON error body below is produced.
 	if err := c.ShouldBindJSON(&request); err != nil {
 		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}

 	voice, err := utils.GetVoice(request.Input, request.Voice, c.Query("r"), c.Query("p"), c.Query("o"), c.Query("s"))
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

 	c.Data(http.StatusOK, "audio/mpeg", voice)
 }
--- a/routes/routes.go
+++ b/routes/routes.go
@@ -1,8 +1,9 @@
 package routes
 import (
 	"github.com/gin-gonic/gin"
 	"tts/handlers"
 	"github.com/gin-gonic/gin"
 )
 func SetupRouter() *gin.Engine {
@@ -14,7 +15,9 @@ func SetupRouter() *gin.Engine {
 	router.GET("/voices", handlers.GetVoiceList)
 	router.POST("/tts", handlers.SynthesizeVoicePost)
 	router.GET("/tts", handlers.SynthesizeVoice)
 	router.GET("/v1/audio/speech", handlers.SynthesizeVoiceOpenAI)
 	router.GET("/", handlers.Index)
 	router.GET("/doc", handlers.ApiDoc)
 	return router
 }
--- a/templates/api-doc.html
+++ b/templates/api-doc.html
@@ -0,0 +1,37 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <title>TTS</title>
 </head>
 <body>
 <h1> 支持接口 </h1>
 <h2>语音合成</h2>
 <div>
    <strong>/tts</strong> | GET / POST(json)
    <a target="_blank" href="/tts?t=岂曰无衣？与子同袍。王于兴师，修我戈矛，与子同仇！岂曰无衣？与子同泽。王于兴师，修我矛戟，与子偕作！岂曰无衣？与子同裳。王于兴师，修我甲兵，与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a>
 </div>
 <pre>
 参数列表：
 1. t: 文本内容 (必填)
 2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural
 3. r: 语速 (可选), 默认为 0
 4. p: 语调 (可选), 默认为 0
 5. o: 输出格式 (可选), 默认为 audio-24khz-48kbitrate-mono-mp3
 6. s: 风格 (可选), 默认为 general
 </pre>
 <h2>声音列表</h2>
 <div>
    <strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a>
 </div>
 <pre>
 参数列表：
 1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh
 2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d
 </pre>
 </body>
 </html>
--- a/templates/index.html
+++ b/templates/index.html
@@ -2,36 +2,120 @@
 <html lang="en">
 <head>
    <meta charset="UTF-8">
-    <title>TTS</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>TTS Demo</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
    <script src="https://cdn.tailwindcss.com"></script>
    <style>
        .top-right {
            position: absolute;
            top: 20px;
            right: 20px;
        }
    </style>
 </head>
-<body>
+<body class="bg-gradient-to-r from-blue-100 to-purple-100 min-h-screen flex items-center justify-center p-4">
-<h1> 支持接口 </h1>
+<div class="top-right">
-<h2>语音合成</h2>
+    <a href="/doc" class="hover:underline p-2 rounded">Documentation</a>
-<div>
+</div>
-    <strong>/tts</strong> | GET / POST(json)
+<div class="bg-white p-8 rounded-xl shadow-lg w-full max-w-4xl">
-    <a target="_blank" href="/tts?t=岂曰无衣？与子同袍。王于兴师，修我戈矛，与子同仇！岂曰无衣？与子同泽。王于兴师，修我矛戟，与子偕作！岂曰无衣？与子同裳。王于兴师，修我甲兵，与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a>
+    <h1 class="text-4xl font-bold mb-8 text-center text-gray-800">语音合成演示</h1>
    <div id="ttsForm" class="space-y-6">
        <textarea id="textInput" rows="6" class="w-full p-4 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700 text-lg resize-none" placeholder="请输入要合成的文本">欢迎使用我们的语音合成演示系统。这项技术能够将文字转换成自然流畅的语音。您可以尝试调整语速和语调，体验不同的合成效果。我们提供多种语言和声音选项，满足您的各种需求。无论是阅读文章、语言学习，还是辅助视障人士，语音合成技术都能发挥重要作用。希望这个演示能让您感受到科技的魅力。祝您使用愉快！</textarea>
        <div class="grid grid-cols-2 gap-4">
            <div>
                <label for="localeSelect" class="block text-sm font-medium text-gray-700 mb-1">语言</label>
                <select id="localeSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700">
                    <option value="zh-CN">中文 (中国)</option>
                    <option value="en-US">English (US)</option>
                    <option value="ja-JP">日本語 (日本)</option>
                </select>
            </div>
            <div>
                <label for="voiceSelect" class="block text-sm font-medium text-gray-700 mb-1">声音</label>
                <select id="voiceSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
            </div>
            <div>
                <label for="styleSelect" class="block text-sm font-medium text-gray-700 mb-1">风格</label>
                <select id="styleSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
            </div>
        </div>
        <div class="flex space-x-4">
            <div class="w-1/2 space-y-2">
                <label for="rateInput" class="block text-sm font-medium text-gray-700">语速</label>
                <input type="range" id="rateInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
            </div>
            <div class="w-1/2 space-y-2">
                <label for="pitchInput" class="block text-sm font-medium text-gray-700">语调</label>
                <input type="range" id="pitchInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
            </div>
        </div>
        <button id="synthesizeButton" class="w-full bg-gradient-to-r from-blue-500 to-purple-600 text-white py-3 px-6 rounded-lg hover:from-blue-600 hover:to-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:ring-opacity-50 transition duration-300 text-lg font-semibold shadow-md">合成并播放</button>
    </div>
    <audio id="audioPlayer" controls class="w-full mt-6 hidden"></audio>
 </div>
 <script>
    $(document).ready(function() {
        let globalVoices = []
        function updateVoices(locale) {
            $('#voiceSelect').empty();
            $.get('/voices?d&l=' + locale, function(voices) {
                globalVoices = voices.voices
                globalVoices.forEach(function(voice) {
                    $('#voiceSelect').append($('<option>', {
                        value: voice.ShortName,
                        text: voice.LocalName + ' (' + voice.ShortName + ')'
                    }));
                });
                updateStyles($('#voiceSelect').val());
            });
        }
-<pre>
+        function updateStyles(voice) {
-参数列表：
+            const currentVoice = globalVoices.filter(v => v.ShortName === voice)[0]
-1. t: 文本内容 (必填)
+            if (currentVoice) {
-2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural
+                $('#styleSelect').empty()
-3. r: 语速 (可选), 默认为 0
+                currentVoice?.StyleList?.forEach(function(style) {
-4. p: 语调 (可选), 默认为 0
+                    $('#styleSelect').append($('<option>', {
-5. o: 输出格式 (可选), 默认为audio-24khz-48kbitrate-mono-mp3
+                        value: style,
-</pre>
+                        text: style
                    }));
                });
            }
        }
        updateVoices($('#localeSelect').val());
-<h2>声音列表</h2>
+        $('#localeSelect').change(function() {
            updateVoices($(this).val());
        });
-<div>
+        $('#voiceSelect').change(function() {
-    <strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a>
+            updateStyles($(this).val());
-</div>
+        });
-<pre>
+
-参数列表：
+        $('#synthesizeButton').click(function() {
-1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh
+            var text = $('#textInput').val();
-2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d
+            var voice = $('#voiceSelect').val();
-</pre>
+            var rate = $('#rateInput').val();
            var pitch = $('#pitchInput').val();
            var locale = $('#localeSelect').val();
            var style = $('#styleSelect').val();
            var url = `/tts?t=${encodeURIComponent(text)}&v=${encodeURIComponent(voice)}&r=${rate}&p=${pitch}&l=${locale}&s=${style}`;
            $('#audioPlayer').attr('src', url).removeClass('hidden')[0].play();
        });
    });
 </script>
 </body>
 </html>
--- a/utils/utils.go
+++ b/utils/utils.go
@@ -47,6 +47,7 @@ const (
 	defaultRate          = "0"
 	defaultPitch         = "0"
 	defaultOutputFormat  = "audio-24khz-48kbitrate-mono-mp3"
 	defaultStyle         = "general"
 )
 var (
@@ -110,8 +111,7 @@ func Sign(urlStr string) string {
 }
 // GetVoice 获取语音合成结果
-// GetVoice 获取语音合成结果
+func GetVoice(text, voiceName, rate, pitch, outputFormat, style string) ([]byte, error) {
 func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error) {
 	if voiceName == "" {
 		voiceName = defaultVoiceName
 	}
@@ -125,6 +125,10 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 		outputFormat = defaultOutputFormat
 	}
 	if style == "" {
 		style = defaultStyle
 	}
 	endpoint, err := GetEndpoint()
 	if err != nil {
 		return nil, err
@@ -137,7 +141,7 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 		"X-Microsoft-OutputFormat": outputFormat,
 	}
-	ssml := GetSsml(text, voiceName, rate, pitch)
+	ssml := GetSsml(text, voiceName, rate, pitch, style)
 	req, err := http.NewRequest("POST", u, bytes.NewBufferString(ssml))
 	if err != nil {
@@ -159,18 +163,20 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 }
 // GetSsml builds the SSML document for a synthesis request.
 //
 // text is the content to speak, voiceName selects the voice, style is the
 // express-as style, and rate and pitch are emitted as percentages on the
 // prosody element. All five values are escaped before being placed into the
 // markup, so caller-supplied input cannot break out of an attribute or
 // inject extra SSML elements.
-func GetSsml(text, voiceName, rate, pitch string) string {
+func GetSsml(text, voiceName, rate, pitch, style string) string {
 	// Escape the text content and every value that lands inside an attribute.
 	text = html.EscapeString(text)
 	voiceName = html.EscapeString(voiceName)
 	style = html.EscapeString(style)
 	rate = html.EscapeString(rate)
 	pitch = html.EscapeString(pitch)
 	return fmt.Sprintf(`
   <speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" version="1.0" xml:lang="zh-CN">
     <voice name="%s">
-       <mstts:express-as style="general" styledegree="1.0" role="default">
+       <mstts:express-as style="%s" styledegree="1.0" role="default">
-         <prosody rate="%s%%" pitch="%s%%" volume="50">%s</prosody>
+         <prosody rate="%s%%" pitch="%s%%" volume="medium">
 			%s
 		</prosody>
       </mstts:express-as>
     </voice>
   </speak>
- `, voiceName, rate, pitch, text)
+ `, voiceName, style, rate, pitch, text)
 }
 // VoiceList 获取可用的语音列表