feat 添加风格支持及首页

2024-10-18 21:06:38 +08:00
parent d7280f7fbb
commit 62883ccb99
5 changed files with 194 additions and 35 deletions
--- a/handlers/handlers.go
+++ b/handlers/handlers.go
@@ -50,7 +50,7 @@ func SynthesizeVoice(c *gin.Context) {
 	pitch := c.DefaultQuery("p", "0")
 	outputFormat := c.DefaultQuery("o", "audio-24khz-48kbitrate-mono-mp3")

-	voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat)
+	voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat, c.Query("s"))
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -65,12 +65,25 @@ func Index(c *gin.Context) {
 	})
 }

+// ApiDoc renders the "api-doc.html" template with its "title" value set to "TTS".
+func ApiDoc(c *gin.Context) {
+	page := gin.H{"title": "TTS"}
+	c.HTML(http.StatusOK, "api-doc.html", page)
+}
+
 type SynthesizeVoiceRequest struct {
 	Text         string `json:"t"`
 	VoiceName    string `json:"v"`
 	Rate         string `json:"r"`
 	Pitch        string `json:"p"`
 	OutputFormat string `json:"o"`
+	Style        string `json:"s"` // style argument SynthesizeVoicePost hands to utils.GetVoice
+}
+
+type SynthesizeVoiceOpenAIRequest struct { // JSON body decoded by SynthesizeVoiceOpenAI
+	Model string `json:"model"` // decoded but not read by SynthesizeVoiceOpenAI
+	Input string `json:"input"` // text passed to utils.GetVoice
+	Voice string `json:"voice"` // voice name passed to utils.GetVoice
 }

 func SynthesizeVoicePost(c *gin.Context) {
@@ -80,7 +93,7 @@ func SynthesizeVoicePost(c *gin.Context) {
 		return
 	}

-	voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat)
+	voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat, request.Style)
 	if err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
@@ -88,3 +101,19 @@ func SynthesizeVoicePost(c *gin.Context) {

 	c.Data(http.StatusOK, "audio/mpeg", voice)
 }
+
+// SynthesizeVoiceOpenAI reads input and voice from the JSON body and r/p/o/s from the query,
+// calls utils.GetVoice and replies with the audio as "audio/mpeg", or with a JSON error.
+func SynthesizeVoiceOpenAI(c *gin.Context) {
+	var request SynthesizeVoiceOpenAIRequest
+	if err := c.ShouldBindJSON(&request); err != nil { // BindJSON would already have written its own 400
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+	voice, err := utils.GetVoice(request.Input, request.Voice, c.Query("r"), c.Query("p"), c.Query("o"), c.Query("s"))
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
+	}
+	c.Data(http.StatusOK, "audio/mpeg", voice)
+}
--- a/routes/routes.go
+++ b/routes/routes.go
@@ -1,8 +1,9 @@
 package routes

 import (
-	"github.com/gin-gonic/gin"
 	"tts/handlers"
+
+	"github.com/gin-gonic/gin"
 )

 func SetupRouter() *gin.Engine {
@@ -14,7 +15,9 @@ func SetupRouter() *gin.Engine {
 	router.GET("/voices", handlers.GetVoiceList)
 	router.POST("/tts", handlers.SynthesizeVoicePost)
 	router.GET("/tts", handlers.SynthesizeVoice)
+	router.GET("/v1/audio/speech", handlers.SynthesizeVoiceOpenAI)
 	router.GET("/", handlers.Index)
+	router.GET("/doc", handlers.ApiDoc)

 	return router
 }
--- a/templates/api-doc.html
+++ b/templates/api-doc.html
@@ -0,0 +1,37 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>TTS</title>
+</head>
+<body>
+<h1> 支持接口 </h1>
+<h2>语音合成</h2>
+<div>
+    <strong>/tts</strong> | GET / POST(json)
+    <a target="_blank" href="/tts?t=岂曰无衣？与子同袍。王于兴师，修我戈矛，与子同仇！岂曰无衣？与子同泽。王于兴师，修我矛戟，与子偕作！岂曰无衣？与子同裳。王于兴师，修我甲兵，与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a>
+</div>
+
+
+<pre>
+参数列表：
+1. t: 文本内容 (必填)
+2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural
+3. r: 语速 (可选), 默认为 0
+4. p: 语调 (可选), 默认为 0
+5. o: 输出格式 (可选), 默认为audio-24khz-48kbitrate-mono-mp3
+6. s: 语音风格 (可选), 默认为 general
+</pre>
+
+<h2>声音列表</h2>
+
+<div>
+    <strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a>
+</div>
+<pre>
+参数列表：
+1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh
+2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d
+</pre>
+</body>
+</html>
--- a/templates/index.html
+++ b/templates/index.html
@@ -2,36 +2,120 @@
 <html lang="en">
 <head>
    <meta charset="UTF-8">
-    <title>TTS</title>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>TTS Demo</title>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
+    <script src="https://cdn.tailwindcss.com"></script>
+    <style>
+        .top-right {
+            position: absolute;
+            top: 20px;
+            right: 20px;
+        }
+    </style>
 </head>
-<body>
-<h1> 支持接口 </h1>
-<h2>语音合成</h2>
-<div>
-    <strong>/tts</strong> | GET / POST(json)
-    <a target="_blank" href="/tts?t=岂曰无衣？与子同袍。王于兴师，修我戈矛，与子同仇！岂曰无衣？与子同泽。王于兴师，修我矛戟，与子偕作！岂曰无衣？与子同裳。王于兴师，修我甲兵，与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a>
+<body class="bg-gradient-to-r from-blue-100 to-purple-100 min-h-screen flex items-center justify-center p-4">
+<div class="top-right">
+    <a href="/doc" class="hover:underline p-2 rounded">Documentation</a>
+</div>
+<div class="bg-white p-8 rounded-xl shadow-lg w-full max-w-4xl">
+    <h1 class="text-4xl font-bold mb-8 text-center text-gray-800">语音合成演示</h1>
+
+
+    <div id="ttsForm" class="space-y-6">
+        <textarea id="textInput" rows="6" class="w-full p-4 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700 text-lg resize-none" placeholder="请输入要合成的文本">欢迎使用我们的语音合成演示系统。这项技术能够将文字转换成自然流畅的语音。您可以尝试调整语速和语调，体验不同的合成效果。我们提供多种语言和声音选项，满足您的各种需求。无论是阅读文章、语言学习，还是辅助视障人士，语音合成技术都能发挥重要作用。希望这个演示能让您感受到科技的魅力。祝您使用愉快！</textarea>
+
+        <div class="grid grid-cols-2 gap-4">
+            <div>
+                <label for="localeSelect" class="block text-sm font-medium text-gray-700 mb-1">语言</label>
+                <select id="localeSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700">
+                    <option value="zh-CN">中文 (中国)</option>
+                    <option value="en-US">English (US)</option>
+                    <option value="ja-JP">日本語 (日本)</option>
+                </select>
+            </div>
+            <div>
+                <label for="voiceSelect" class="block text-sm font-medium text-gray-700 mb-1">声音</label>
+                <select id="voiceSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
+            </div>
+            <div>
+                <label for="styleSelect" class="block text-sm font-medium text-gray-700 mb-1">风格</label>
+                <select id="styleSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
+            </div>
+        </div>
+
+        <div class="flex space-x-4">
+            <div class="w-1/2 space-y-2">
+                <label for="rateInput" class="block text-sm font-medium text-gray-700">语速</label>
+                <input type="range" id="rateInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+            </div>
+
+            <div class="w-1/2 space-y-2">
+                <label for="pitchInput" class="block text-sm font-medium text-gray-700">语调</label>
+                <input type="range" id="pitchInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
+            </div>
+        </div>
+
+        <button id="synthesizeButton" class="w-full bg-gradient-to-r from-blue-500 to-purple-600 text-white py-3 px-6 rounded-lg hover:from-blue-600 hover:to-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:ring-opacity-50 transition duration-300 text-lg font-semibold shadow-md">合成并播放</button>
+    </div>
+
+    <audio id="audioPlayer" controls class="w-full mt-6 hidden"></audio>
 </div>

+<script>
+    $(document).ready(function() {
+        let globalVoices = []
+        function updateVoices(locale) {
+            $('#voiceSelect').empty();
+            $.get('/voices?d&l=' + locale, function(voices) {
+                globalVoices = voices.voices
+                globalVoices.forEach(function(voice) {
+                    $('#voiceSelect').append($('<option>', {
+                        value: voice.ShortName,
+                        text: voice.LocalName + ' (' + voice.ShortName + ')'
+                    }));
+                });
+                updateStyles($('#voiceSelect').val());
+            });
+        }

-<pre>
-参数列表：
-1. t: 文本内容 (必填)
-2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural
-3. r: 语速 (可选), 默认为 0
-4. p: 语调 (可选), 默认为 0
-5. o: 输出格式 (可选), 默认为audio-24khz-48kbitrate-mono-mp3
-</pre>
+        function updateStyles(voice) {
+            const currentVoice = globalVoices.filter(v => v.ShortName === voice)[0]
+            if (currentVoice) {
+                $('#styleSelect').empty()
+                currentVoice?.StyleList?.forEach(function(style) {
+                    $('#styleSelect').append($('<option>', {
+                        value: style,
+                        text: style
+                    }));
+                });
+            }
+        }

+        updateVoices($('#localeSelect').val());

-<h2>声音列表</h2>
+        $('#localeSelect').change(function() {
+            updateVoices($(this).val());
+        });

-<div>
-    <strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a>
-</div>
-<pre>
-参数列表：
-1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh
-2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d
-</pre>
+        $('#voiceSelect').change(function() {
+            updateStyles($(this).val());
+        });
+
+        $('#synthesizeButton').click(function() {
+            var text = $('#textInput').val();
+            var voice = $('#voiceSelect').val();
+            var rate = $('#rateInput').val();
+            var pitch = $('#pitchInput').val();
+            var locale = $('#localeSelect').val();
+            var style = $('#styleSelect').val();
+
+            var url = `/tts?t=${encodeURIComponent(text)}&v=${encodeURIComponent(voice)}&r=${rate}&p=${pitch}&l=${locale}&s=${style}`;
+
+            $('#audioPlayer').attr('src', url).removeClass('hidden')[0].play();
+        });
+    });
+</script>
 </body>
-</html>
+</html>
+
--- a/utils/utils.go
+++ b/utils/utils.go
@@ -47,6 +47,7 @@ const (
 	defaultRate          = "0"
 	defaultPitch         = "0"
 	defaultOutputFormat  = "audio-24khz-48kbitrate-mono-mp3"
+	defaultStyle         = "general"
 )

 var (
@@ -110,8 +111,7 @@ func Sign(urlStr string) string {
 }

 // GetVoice 获取语音合成结果
-// GetVoice 获取语音合成结果
-func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error) {
+func GetVoice(text, voiceName, rate, pitch, outputFormat, style string) ([]byte, error) {
 	if voiceName == "" {
 		voiceName = defaultVoiceName
 	}
@@ -125,6 +125,10 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 		outputFormat = defaultOutputFormat
 	}

+	if style == "" {
+		style = defaultStyle
+	}
+
 	endpoint, err := GetEndpoint()
 	if err != nil {
 		return nil, err
@@ -137,7 +141,7 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 		"X-Microsoft-OutputFormat": outputFormat,
 	}

-	ssml := GetSsml(text, voiceName, rate, pitch)
+	ssml := GetSsml(text, voiceName, rate, pitch, style)

 	req, err := http.NewRequest("POST", u, bytes.NewBufferString(ssml))
 	if err != nil {
@@ -159,18 +163,20 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
 }

 // GetSsml 生成 SSML 格式的文本
-func GetSsml(text, voiceName, rate, pitch string) string {
+func GetSsml(text, voiceName, rate, pitch, style string) string {
 	// 对文本进行转义
 	text = html.EscapeString(text)
 	return fmt.Sprintf(`
   <speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" version="1.0" xml:lang="zh-CN">
     <voice name="%s">
-       <mstts:express-as style="general" styledegree="1.0" role="default">
-         <prosody rate="%s%%" pitch="%s%%" volume="50">%s</prosody>
+       <mstts:express-as style="%s" styledegree="1.0" role="default">
+         <prosody rate="%s%%" pitch="%s%%" volume="medium">
+			%s
+		</prosody>
       </mstts:express-as>
     </voice>
   </speak>
- `, voiceName, rate, pitch, text)
+ `, voiceName, style, rate, pitch, text)
 }

 // VoiceList 获取可用的语音列表