feat: add style selection for TTS and update related functionality

This commit is contained in:
王锦强
2025-03-09 17:21:52 +08:00
parent 4ec09304f6
commit 1cd2ac1624
5 changed files with 70 additions and 7 deletions

View File

@@ -153,6 +153,7 @@ func (h *TTSHandler) HandleTTS(w http.ResponseWriter, r *http.Request) {
Voice: q.Get("v"), Voice: q.Get("v"),
Rate: q.Get("r"), Rate: q.Get("r"),
Pitch: q.Get("p"), Pitch: q.Get("p"),
Style: q.Get("s"),
} }
case http.MethodPost: case http.MethodPost:
// 从POST JSON体获取 // 从POST JSON体获取
@@ -174,6 +175,7 @@ func (h *TTSHandler) HandleTTS(w http.ResponseWriter, r *http.Request) {
Voice: r.FormValue("voice"), Voice: r.FormValue("voice"),
Rate: r.FormValue("rate"), Rate: r.FormValue("rate"),
Pitch: r.FormValue("pitch"), Pitch: r.FormValue("pitch"),
Style: r.FormValue("style"),
} }
} }
default: default:

View File

@@ -6,6 +6,7 @@ type TTSRequest struct {
Voice string `json:"voice"` // 语音ID Voice string `json:"voice"` // 语音ID
Rate string `json:"rate"` // 语速 (-100% 到 +100%) Rate string `json:"rate"` // 语速 (-100% 到 +100%)
Pitch string `json:"pitch"` // 语调 (-100% 到 +100%) Pitch string `json:"pitch"` // 语调 (-100% 到 +100%)
Style string `json:"style"` // 说话风格
} }
// TTSResponse 表示一个语音合成响应 // TTSResponse 表示一个语音合成响应

View File

@@ -25,7 +25,7 @@ const (
ttsEndpoint = "https://%s.tts.speech.microsoft.com/cognitiveservices/v1" ttsEndpoint = "https://%s.tts.speech.microsoft.com/cognitiveservices/v1"
ssmlTemplate = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang='%s'> ssmlTemplate = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang='%s'>
<voice name='%s'> <voice name='%s'>
<mstts:express-as style="general" styledegree="1.0" role="default"> <mstts:express-as style="%s" styledegree="1.0" role="default">
<prosody rate='%s%%' pitch='%s%%' volume="medium"> <prosody rate='%s%%' pitch='%s%%' volume="medium">
%s %s
</prosody> </prosody>
@@ -227,6 +227,11 @@ func (c *Client) createTTSRequest(ctx context.Context, req models.TTSRequest) (*
voice = c.defaultVoice voice = c.defaultVoice
} }
style := req.Style
if req.Style == "" {
style = "general"
}
rate := req.Rate rate := req.Rate
if rate == "" { if rate == "" {
rate = c.defaultRate rate = c.defaultRate
@@ -249,7 +254,7 @@ func (c *Client) createTTSRequest(ctx context.Context, req models.TTSRequest) (*
escapedText := html.EscapeString(req.Text) escapedText := html.EscapeString(req.Text)
// 准备SSML内容 // 准备SSML内容
ssml := fmt.Sprintf(ssmlTemplate, locale, voice, rate, pitch, escapedText) ssml := fmt.Sprintf(ssmlTemplate, locale, voice, style, rate, pitch, escapedText)
// 获取端点信息 // 获取端点信息
endpoint, err := c.getEndpoint(ctx) endpoint, err := c.getEndpoint(ctx)

View File

@@ -2,6 +2,7 @@ document.addEventListener('DOMContentLoaded', function() {
// 获取DOM元素 // 获取DOM元素
const textInput = document.getElementById('text'); const textInput = document.getElementById('text');
const voiceSelect = document.getElementById('voice'); const voiceSelect = document.getElementById('voice');
const styleSelect = document.getElementById('style');
const rateInput = document.getElementById('rate'); const rateInput = document.getElementById('rate');
const rateValue = document.getElementById('rateValue'); const rateValue = document.getElementById('rateValue');
const pitchInput = document.getElementById('pitch'); const pitchInput = document.getElementById('pitch');
@@ -15,6 +16,8 @@ document.addEventListener('DOMContentLoaded', function() {
// 保存最后一个音频URL // 保存最后一个音频URL
let lastAudioUrl = ''; let lastAudioUrl = '';
// 存储语音数据
let voicesData = [];
// 初始化 // 初始化
initVoicesList(); initVoicesList();
@@ -37,13 +40,18 @@ document.addEventListener('DOMContentLoaded', function() {
pitchValue.textContent = value + '%'; pitchValue.textContent = value + '%';
}); });
// Refresh the list of available styles whenever a different voice is picked.
voiceSelect.addEventListener('change', () => {
    updateStyleOptions();
});
// 获取可用语音列表 // 获取可用语音列表
async function initVoicesList() { async function initVoicesList() {
try { try {
const response = await fetch(`${config.basePath}/voices`); const response = await fetch(`${config.basePath}/voices`);
if (!response.ok) throw new Error('获取语音列表失败'); if (!response.ok) throw new Error('获取语音列表失败');
const voices = await response.json(); voicesData = await response.json();
// 清空并重建选项 // 清空并重建选项
voiceSelect.innerHTML = ''; voiceSelect.innerHTML = '';
@@ -51,7 +59,7 @@ document.addEventListener('DOMContentLoaded', function() {
// 按语言和名称分组 // 按语言和名称分组
const voicesByLocale = {}; const voicesByLocale = {};
voices.forEach(voice => { voicesData.forEach(voice => {
if (!voicesByLocale[voice.locale]) { if (!voicesByLocale[voice.locale]) {
voicesByLocale[voice.locale] = []; voicesByLocale[voice.locale] = [];
} }
@@ -78,12 +86,49 @@ document.addEventListener('DOMContentLoaded', function() {
voiceSelect.appendChild(optgroup); voiceSelect.appendChild(optgroup);
} }
// 初始化风格列表
updateStyleOptions();
} catch (error) { } catch (error) {
console.error('获取语音列表失败:', error); console.error('获取语音列表失败:', error);
voiceSelect.innerHTML = '<option value="">无法加载语音列表</option>'; voiceSelect.innerHTML = '<option value="">无法加载语音列表</option>';
} }
} }
/**
 * Rebuilds the style <select> so that it matches the currently selected voice.
 *
 * Looks up `voiceSelect.value` in `voicesData` (matching on `short_name`) and
 * offers every entry of that voice's `style_list`. The neutral "general" style
 * is always offered, even when the voice does not list it, so the user can
 * always opt out of an expressive style; without it the browser would silently
 * pre-select the first expressive style of the voice.
 *
 * Pre-selection: `config.defaultStyle` when the voice supports it, otherwise
 * "general".
 *
 * Reads the outer-scope `styleSelect`, `voiceSelect`, `voicesData` and `config`;
 * takes no arguments and returns nothing.
 */
function updateStyleOptions() {
    const NEUTRAL_STYLE = 'general';

    // Drop the options that belonged to the previously selected voice.
    styleSelect.innerHTML = '';

    // Find the metadata of the voice that is selected right now.
    const selectedVoice = voiceSelect.value;
    const voiceData = voicesData.find(v => v.short_name === selectedVoice);
    const voiceStyles = voiceData && Array.isArray(voiceData.style_list)
        ? voiceData.style_list
        : [];

    // Styles reported by the voice are labelled with their own name.
    const entries = voiceStyles.map(style => ({ value: style, label: style }));

    // Guarantee a neutral choice, using the same label as the no-styles fallback.
    if (!voiceStyles.includes(NEUTRAL_STYLE)) {
        entries.unshift({ value: NEUTRAL_STYLE, label: '普通' });
    }

    // Honour the configured default only if this voice actually supports it.
    const preferredStyle = config.defaultStyle && voiceStyles.includes(config.defaultStyle)
        ? config.defaultStyle
        : NEUTRAL_STYLE;

    entries.forEach(({ value, label }) => {
        const option = document.createElement('option');
        option.value = value;
        option.textContent = label;
        if (value === preferredStyle) {
            option.selected = true;
        }
        styleSelect.appendChild(option);
    });
}
// 初始化事件监听器 // 初始化事件监听器
function initEventListeners() { function initEventListeners() {
// 转换按钮点击事件 // 转换按钮点击事件
@@ -137,6 +182,7 @@ document.addEventListener('DOMContentLoaded', function() {
} }
const voice = voiceSelect.value; const voice = voiceSelect.value;
const style = styleSelect.value;
const rate = rateInput.value; const rate = rateInput.value;
const pitch = pitchInput.value; const pitch = pitchInput.value;
@@ -149,6 +195,7 @@ document.addEventListener('DOMContentLoaded', function() {
const params = new URLSearchParams({ const params = new URLSearchParams({
t: text, t: text,
v: voice, v: voice,
s: style,
r: rate, r: rate,
p: pitch p: pitch
}); });

View File

@@ -34,15 +34,22 @@
</select> </select>
</div> </div>
<div class="setting-group">
<label for="style">风格:</label>
<select id="style">
<option value="loading">加载中...</option>
</select>
</div>
<div class="setting-group"> <div class="setting-group">
<label for="rate">语速:</label> <label for="rate">语速:</label>
<input type="range" id="rate" min="-50" max="50" value="0"> <input type="range" id="rate" min="-100" max="100" value="0">
<span id="rateValue">0%</span> <span id="rateValue">0%</span>
</div> </div>
<div class="setting-group"> <div class="setting-group">
<label for="pitch">语调:</label> <label for="pitch">语调:</label>
<input type="range" id="pitch" min="-50" max="50" value="0"> <input type="range" id="pitch" min="-100" max="100" value="0">
<span id="pitchValue">0%</span> <span id="pitchValue">0%</span>
</div> </div>
</div> </div>
@@ -75,7 +82,8 @@
basePath: "{{.BasePath}}", basePath: "{{.BasePath}}",
defaultVoice: "{{.DefaultVoice}}", defaultVoice: "{{.DefaultVoice}}",
defaultRate: "{{.DefaultRate}}", defaultRate: "{{.DefaultRate}}",
defaultPitch: "{{.DefaultPitch}}" defaultPitch: "{{.DefaultPitch}}",
defaultStyle: "{{.DefaultStyle}}"
}; };
</script> </script>
<script src="{{.BasePath}}/static/js/app.js"></script> <script src="{{.BasePath}}/static/js/app.js"></script>