feat 添加风格支持及首页

This commit is contained in:
zuoban
2024-10-18 21:06:38 +08:00
parent d7280f7fbb
commit 62883ccb99
5 changed files with 194 additions and 35 deletions

View File

@@ -50,7 +50,7 @@ func SynthesizeVoice(c *gin.Context) {
pitch := c.DefaultQuery("p", "0") pitch := c.DefaultQuery("p", "0")
outputFormat := c.DefaultQuery("o", "audio-24khz-48kbitrate-mono-mp3") outputFormat := c.DefaultQuery("o", "audio-24khz-48kbitrate-mono-mp3")
voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat) voice, err := utils.GetVoice(text, voiceName, rate, pitch, outputFormat, c.Query("s"))
if err != nil { if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return return
@@ -65,12 +65,25 @@ func Index(c *gin.Context) {
}) })
} }
// ApiDoc serves the API documentation page by rendering the "api-doc.html"
// template with the page title set to "TTS".
func ApiDoc(c *gin.Context) {
	data := gin.H{"title": "TTS"}
	c.HTML(http.StatusOK, "api-doc.html", data)
}
type SynthesizeVoiceRequest struct { type SynthesizeVoiceRequest struct {
Text string `json:"t"` Text string `json:"t"`
VoiceName string `json:"v"` VoiceName string `json:"v"`
Rate string `json:"r"` Rate string `json:"r"`
Pitch string `json:"p"` Pitch string `json:"p"`
OutputFormat string `json:"o"` OutputFormat string `json:"o"`
Style string `json:"s"`
}
type SynthesizeVoiceOpenAIRequest struct {
Model string `json:"model"`
Input string `json:"input"`
Voice string `json:"voice"`
} }
func SynthesizeVoicePost(c *gin.Context) { func SynthesizeVoicePost(c *gin.Context) {
@@ -80,7 +93,7 @@ func SynthesizeVoicePost(c *gin.Context) {
return return
} }
voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat) voice, err := utils.GetVoice(request.Text, request.VoiceName, request.Rate, request.Pitch, request.OutputFormat, request.Style)
if err != nil { if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
return return
@@ -88,3 +101,19 @@ func SynthesizeVoicePost(c *gin.Context) {
c.Data(http.StatusOK, "audio/mpeg", voice) c.Data(http.StatusOK, "audio/mpeg", voice)
} }
// SynthesizeVoiceOpenAI handles speech requests whose JSON body has the shape
// declared by SynthesizeVoiceOpenAIRequest.
//
// The text to speak is taken from the body's "input" field and the voice from
// "voice". Rate, pitch, output format and style are read from the query
// parameters "r", "p", "o" and "s" and handed to utils.GetVoice as-is. The
// "model" field is decoded but not used.
//
// Responses: 400 with a JSON error when the body cannot be decoded, 500 with a
// JSON error when synthesis fails, otherwise 200 with the audio bytes sent as
// "audio/mpeg".
func SynthesizeVoiceOpenAI(c *gin.Context) {
	var request SynthesizeVoiceOpenAIRequest
	// ShouldBindJSON only reports the error. BindJSON would abort the request
	// and commit the 400 header on its own, so the JSON error written below
	// would be sent after the headers had already gone out.
	if err := c.ShouldBindJSON(&request); err != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
		return
	}

	voice, err := utils.GetVoice(request.Input, request.Voice, c.Query("r"), c.Query("p"), c.Query("o"), c.Query("s"))
	if err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
		return
	}

	c.Data(http.StatusOK, "audio/mpeg", voice)
}

View File

@@ -1,8 +1,9 @@
package routes package routes
import ( import (
"github.com/gin-gonic/gin"
"tts/handlers" "tts/handlers"
"github.com/gin-gonic/gin"
) )
func SetupRouter() *gin.Engine { func SetupRouter() *gin.Engine {
@@ -14,7 +15,9 @@ func SetupRouter() *gin.Engine {
router.GET("/voices", handlers.GetVoiceList) router.GET("/voices", handlers.GetVoiceList)
router.POST("/tts", handlers.SynthesizeVoicePost) router.POST("/tts", handlers.SynthesizeVoicePost)
router.GET("/tts", handlers.SynthesizeVoice) router.GET("/tts", handlers.SynthesizeVoice)
router.GET("/v1/audio/speech", handlers.SynthesizeVoiceOpenAI)
router.GET("/", handlers.Index) router.GET("/", handlers.Index)
router.GET("/doc", handlers.ApiDoc)
return router return router
} }

37
templates/api-doc.html Normal file
View File

@@ -0,0 +1,37 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>TTS</title>
</head>
<body>
<h1> 支持接口 </h1>
<h2>语音合成</h2>
<div>
<strong>/tts</strong> | GET / POST(json)
<a target="_blank" href="/tts?t=岂曰无衣?与子同袍。王于兴师,修我戈矛,与子同仇!岂曰无衣?与子同泽。王于兴师,修我矛戟,与子偕作!岂曰无衣?与子同裳。王于兴师,修我甲兵,与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a>
</div>
<pre>
参数列表:
1. t: 文本内容 (必填)
2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural
3. r: 语速 (可选), 默认为 0
4. p: 语调 (可选), 默认为 0
5. o: 输出格式 (可选), 默认为 audio-24khz-48kbitrate-mono-mp3
6. s: 风格 (可选), 默认为 general
</pre>
<h2>声音列表</h2>
<div>
<strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a>
</div>
<pre>
参数列表:
1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh
2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d
</pre>
</body>
</html>

View File

@@ -2,36 +2,120 @@
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>TTS</title> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>TTS Demo</title>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
<script src="https://cdn.tailwindcss.com"></script>
<style>
.top-right {
position: absolute;
top: 20px;
right: 20px;
}
</style>
</head> </head>
<body> <body class="bg-gradient-to-r from-blue-100 to-purple-100 min-h-screen flex items-center justify-center p-4">
<h1> 支持接口 </h1> <div class="top-right">
<h2>语音合成</h2> <a href="/doc" class="hover:underline p-2 rounded">Documentation</a>
<div> </div>
<strong>/tts</strong> | GET / POST(json) <div class="bg-white p-8 rounded-xl shadow-lg w-full max-w-4xl">
<a target="_blank" href="/tts?t=岂曰无衣?与子同袍。王于兴师,修我戈矛,与子同仇!岂曰无衣?与子同泽。王于兴师,修我矛戟,与子偕作!岂曰无衣?与子同裳。王于兴师,修我甲兵,与子偕行!&v=zh-CN-XiaoxiaoMultilingualNeural&r=0&p=0&o=audio-24khz-48kbitrate-mono-mp3">try</a> <h1 class="text-4xl font-bold mb-8 text-center text-gray-800">语音合成演示</h1>
<div id="ttsForm" class="space-y-6">
<textarea id="textInput" rows="6" class="w-full p-4 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700 text-lg resize-none" placeholder="请输入要合成的文本">欢迎使用我们的语音合成演示系统。这项技术能够将文字转换成自然流畅的语音。您可以尝试调整语速和语调,体验不同的合成效果。我们提供多种语言和声音选项,满足您的各种需求。无论是阅读文章、语言学习,还是辅助视障人士,语音合成技术都能发挥重要作用。希望这个演示能让您感受到科技的魅力。祝您使用愉快!</textarea>
<div class="grid grid-cols-2 gap-4">
<div>
<label for="localeSelect" class="block text-sm font-medium text-gray-700 mb-1">语言</label>
<select id="localeSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700">
<option value="zh-CN">中文 (中国)</option>
<option value="en-US">English (US)</option>
<option value="ja-JP">日本語 (日本)</option>
</select>
</div>
<div>
<label for="voiceSelect" class="block text-sm font-medium text-gray-700 mb-1">声音</label>
<select id="voiceSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
</div>
<div>
<label for="styleSelect" class="block text-sm font-medium text-gray-700 mb-1">风格</label>
<select id="styleSelect" class="w-full p-3 border border-gray-300 rounded-lg focus:outline-none focus:ring-2 focus:ring-blue-500 text-gray-700"></select>
</div>
</div>
<div class="flex space-x-4">
<div class="w-1/2 space-y-2">
<label for="rateInput" class="block text-sm font-medium text-gray-700">语速</label>
<input type="range" id="rateInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
</div>
<div class="w-1/2 space-y-2">
<label for="pitchInput" class="block text-sm font-medium text-gray-700">语调</label>
<input type="range" id="pitchInput" min="-100" max="100" value="0" class="w-full h-2 bg-gray-200 rounded-lg appearance-none cursor-pointer">
</div>
</div>
<button id="synthesizeButton" class="w-full bg-gradient-to-r from-blue-500 to-purple-600 text-white py-3 px-6 rounded-lg hover:from-blue-600 hover:to-purple-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:ring-opacity-50 transition duration-300 text-lg font-semibold shadow-md">合成并播放</button>
</div>
<audio id="audioPlayer" controls class="w-full mt-6 hidden"></audio>
</div> </div>
<script>
$(document).ready(function() {
let globalVoices = []
function updateVoices(locale) {
$('#voiceSelect').empty();
$.get('/voices?d&l=' + locale, function(voices) {
globalVoices = voices.voices
globalVoices.forEach(function(voice) {
$('#voiceSelect').append($('<option>', {
value: voice.ShortName,
text: voice.LocalName + ' (' + voice.ShortName + ')'
}));
});
updateStyles($('#voiceSelect').val());
});
}
<pre> function updateStyles(voice) {
参数列表: const currentVoice = globalVoices.filter(v => v.ShortName === voice)[0]
1. t: 文本内容 (必填) if (currentVoice) {
2. v: 语音名称 (可选), 默认为 zh-CN-XiaoxiaoMultilingualNeural $('#styleSelect').empty()
3. r: 语速 (可选), 默认为 0 currentVoice?.StyleList?.forEach(function(style) {
4. p: 语调 (可选), 默认为 0 $('#styleSelect').append($('<option>', {
5. o: 输出格式 (可选), 默认为audio-24khz-48kbitrate-mono-mp3 value: style,
</pre> text: style
}));
});
}
}
updateVoices($('#localeSelect').val());
<h2>声音列表</h2> $('#localeSelect').change(function() {
updateVoices($(this).val());
});
<div> $('#voiceSelect').change(function() {
<strong>/voices</strong> | GET <a target="_blank" href="/voices?l=zh">try</a> updateStyles($(this).val());
</div> });
<pre>
参数列表: $('#synthesizeButton').click(function() {
1. l: 语言区域 (可选), 使用 contains 匹配,如 l=zh var text = $('#textInput').val();
2. d: 显示详细信息 (可选) , 默认为 false, 如需显示详细信息, 请添加参数d , 如 /voices?d var voice = $('#voiceSelect').val();
</pre> var rate = $('#rateInput').val();
var pitch = $('#pitchInput').val();
var locale = $('#localeSelect').val();
var style = $('#styleSelect').val();
var url = `/tts?t=${encodeURIComponent(text)}&v=${encodeURIComponent(voice)}&r=${rate}&p=${pitch}&l=${locale}&s=${style}`;
$('#audioPlayer').attr('src', url).removeClass('hidden')[0].play();
});
});
</script>
</body> </body>
</html> </html>

View File

@@ -47,6 +47,7 @@ const (
defaultRate = "0" defaultRate = "0"
defaultPitch = "0" defaultPitch = "0"
defaultOutputFormat = "audio-24khz-48kbitrate-mono-mp3" defaultOutputFormat = "audio-24khz-48kbitrate-mono-mp3"
defaultStyle = "general"
) )
var ( var (
@@ -110,8 +111,7 @@ func Sign(urlStr string) string {
} }
// GetVoice 获取语音合成结果 // GetVoice 获取语音合成结果
// GetVoice 获取语音合成结果 func GetVoice(text, voiceName, rate, pitch, outputFormat, style string) ([]byte, error) {
func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error) {
if voiceName == "" { if voiceName == "" {
voiceName = defaultVoiceName voiceName = defaultVoiceName
} }
@@ -125,6 +125,10 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
outputFormat = defaultOutputFormat outputFormat = defaultOutputFormat
} }
if style == "" {
style = defaultStyle
}
endpoint, err := GetEndpoint() endpoint, err := GetEndpoint()
if err != nil { if err != nil {
return nil, err return nil, err
@@ -137,7 +141,7 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
"X-Microsoft-OutputFormat": outputFormat, "X-Microsoft-OutputFormat": outputFormat,
} }
ssml := GetSsml(text, voiceName, rate, pitch) ssml := GetSsml(text, voiceName, rate, pitch, style)
req, err := http.NewRequest("POST", u, bytes.NewBufferString(ssml)) req, err := http.NewRequest("POST", u, bytes.NewBufferString(ssml))
if err != nil { if err != nil {
@@ -159,18 +163,20 @@ func GetVoice(text, voiceName, rate, pitch, outputFormat string) ([]byte, error)
} }
// GetSsml builds the SSML document that is posted to the synthesis endpoint.
//
// text is the content to speak. voiceName, style, rate and pitch fill the
// voice name, the express-as style and the prosody rate/pitch attributes
// (rate and pitch are rendered as percentages). Every argument is escaped
// before it is interpolated, so markup characters in caller-supplied values
// cannot break out of an attribute or inject extra elements.
func GetSsml(text, voiceName, rate, pitch, style string) string {
	// All five values can originate from request parameters, so none of them
	// may be written into the XML unescaped.
	text = html.EscapeString(text)
	voiceName = html.EscapeString(voiceName)
	rate = html.EscapeString(rate)
	pitch = html.EscapeString(pitch)
	style = html.EscapeString(style)

	return fmt.Sprintf(`
<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" version="1.0" xml:lang="zh-CN">
<voice name="%s">
<mstts:express-as style="%s" styledegree="1.0" role="default">
<prosody rate="%s%%" pitch="%s%%" volume="medium">
%s
</prosody>
</mstts:express-as>
</voice>
</speak>
`, voiceName, style, rate, pitch, text)
}
// VoiceList 获取可用的语音列表 // VoiceList 获取可用的语音列表