Fix: Grouped sentence chunks into visual paragraphs and improved TTS response speed
Browse files
App/backend/supertonic_engine.py
CHANGED
|
@@ -95,16 +95,24 @@ class SupertonicVoice:
|
|
| 95 |
|
| 96 |
opts = ort.SessionOptions()
|
| 97 |
providers = ort.get_available_providers()
|
| 98 |
-
|
|
|
|
|
|
|
| 99 |
if "CUDAExecutionProvider" in providers:
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
providers = ["CPUExecutionProvider"]
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
self.sample_rate = self.cfgs["ae"]["sample_rate"]
|
| 110 |
self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]
|
|
|
|
| 95 |
|
| 96 |
opts = ort.SessionOptions()
|
| 97 |
providers = ort.get_available_providers()
|
| 98 |
+
|
| 99 |
+
# Prefer GPU (CUDA), then CPU
|
| 100 |
+
active_providers = []
|
| 101 |
if "CUDAExecutionProvider" in providers:
|
| 102 |
+
active_providers.append("CUDAExecutionProvider")
|
| 103 |
+
active_providers.append("CPUExecutionProvider")
|
|
|
|
| 104 |
|
| 105 |
+
try:
|
| 106 |
+
self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=active_providers)
|
| 107 |
+
self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=active_providers)
|
| 108 |
+
self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=active_providers)
|
| 109 |
+
self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=active_providers)
|
| 110 |
+
except Exception as e:
|
| 111 |
+
print(f"Warning: Failed to load with requested GPU providers. Falling back to CPU. ({e})")
|
| 112 |
+
self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
|
| 113 |
+
self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
|
| 114 |
+
self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
|
| 115 |
+
self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
|
| 116 |
|
| 117 |
self.sample_rate = self.cfgs["ae"]["sample_rate"]
|
| 118 |
self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]
|
App/frontend/src/App.jsx
CHANGED
|
@@ -260,7 +260,7 @@ function App() {
|
|
| 260 |
|
| 261 |
// Playback Logic
|
| 262 |
const prefetchNextChunks = async (currentIndex) => {
|
| 263 |
-
const PREFETCH_COUNT =
|
| 264 |
const apiUrl = import.meta.env.VITE_API_URL || '';
|
| 265 |
const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
|
| 266 |
const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
|
|
@@ -614,6 +614,20 @@ function App() {
|
|
| 614 |
}, [view, chunks.length, isPlaying]);
|
| 615 |
|
| 616 |
// Memoized lists
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
const filteredBooks = useMemo(() => {
|
| 618 |
if (!books || !Array.isArray(books)) return [];
|
| 619 |
let list = books
|
|
@@ -854,9 +868,19 @@ function App() {
|
|
| 854 |
|
| 855 |
<div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
|
| 856 |
<div className="book-page">
|
| 857 |
-
{
|
| 858 |
-
|
| 859 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 860 |
</div>
|
| 861 |
</div>
|
| 862 |
|
|
|
|
| 260 |
|
| 261 |
// Playback Logic
|
| 262 |
const prefetchNextChunks = async (currentIndex) => {
|
| 263 |
+
const PREFETCH_COUNT = 3;
|
| 264 |
const apiUrl = import.meta.env.VITE_API_URL || '';
|
| 265 |
const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
|
| 266 |
const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
|
|
|
|
| 614 |
}, [view, chunks.length, isPlaying]);
|
| 615 |
|
| 616 |
// Memoized lists
|
| 617 |
+
// Collapse runs of consecutive chunks that share a paragraphId into a single
// visual paragraph. Each group keeps the paragraph id, the tag type taken from
// the first chunk of the run, and the member sentences together with their
// index in the flat `chunks` array.
const paragraphGroups = useMemo(() => (
  chunks.reduce((groups, chunk, globalIndex) => {
    const isPlainString = typeof chunk === 'string';
    // Chunks without a paragraphId fall back to their own index, so each one
    // forms its own single-item paragraph.
    const paragraphKey = chunk.paragraphId === undefined ? globalIndex : chunk.paragraphId;

    let tail = groups[groups.length - 1];
    if (!tail || tail.id !== paragraphKey) {
      tail = {
        id: paragraphKey,
        type: isPlainString ? 'p' : (chunk.type || 'p'),
        items: [],
      };
      groups.push(tail);
    }

    tail.items.push({ text: isPlainString ? chunk : chunk.text, globalIndex });
    return groups;
  }, [])
), [chunks]);
|
| 630 |
+
|
| 631 |
const filteredBooks = useMemo(() => {
|
| 632 |
if (!books || !Array.isArray(books)) return [];
|
| 633 |
let list = books
|
|
|
|
| 868 |
|
| 869 |
<div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
|
| 870 |
<div className="book-page">
|
| 871 |
+
{paragraphGroups.map((group, i) => {
|
| 872 |
+
const isActiveParagraph = group.items.some(item => item.globalIndex === currentChunkIndex);
|
| 873 |
+
return (
|
| 874 |
+
<ReaderParagraph
|
| 875 |
+
key={i}
|
| 876 |
+
group={group}
|
| 877 |
+
currentChunkIndex={currentChunkIndex}
|
| 878 |
+
isActive={isActiveParagraph}
|
| 879 |
+
activeParagraphRef={isActiveParagraph ? activeParagraphRef : null}
|
| 880 |
+
onClick={handleParagraphClick}
|
| 881 |
+
/>
|
| 882 |
+
);
|
| 883 |
+
})}
|
| 884 |
</div>
|
| 885 |
</div>
|
| 886 |
|
App/frontend/src/components/AudioDock.jsx
CHANGED
|
@@ -45,7 +45,6 @@ export default function AudioDock({
|
|
| 45 |
{formatTimeLeft(timeLeft)}
|
| 46 |
</span>
|
| 47 |
)}
|
| 48 |
-
<div className="dock-subtitle">Powered by Piper</div>
|
| 49 |
</div>
|
| 50 |
</div>
|
| 51 |
|
|
|
|
| 45 |
{formatTimeLeft(timeLeft)}
|
| 46 |
</span>
|
| 47 |
)}
|
|
|
|
| 48 |
</div>
|
| 49 |
</div>
|
| 50 |
|
App/frontend/src/components/ReaderParagraph.jsx
CHANGED
|
@@ -1,29 +1,39 @@
|
|
| 1 |
import React from 'react';
|
| 2 |
|
| 3 |
-
const ReaderParagraph = React.memo(({
|
| 4 |
-
const
|
| 5 |
-
const
|
| 6 |
-
const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(type);
|
| 7 |
-
const Tag = isHeading ? type : 'p';
|
| 8 |
|
| 9 |
return (
|
| 10 |
<Tag
|
| 11 |
-
data-chunk-index={index}
|
| 12 |
ref={isActive ? activeParagraphRef : null}
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
style={isActive ? {
|
| 16 |
-
backgroundColor: 'var(--highlight-color)',
|
| 17 |
-
color: '#000000',
|
| 18 |
-
borderRadius: '6px',
|
| 19 |
-
padding: '0.4rem 0.75rem',
|
| 20 |
-
marginLeft: '-0.75rem',
|
| 21 |
-
marginRight: '-0.75rem',
|
| 22 |
-
boxShadow: '0 0 0 6px var(--highlight-color)',
|
| 23 |
-
transition: 'background-color 0.3s ease, box-shadow 0.3s ease'
|
| 24 |
-
} : {}}
|
| 25 |
>
|
| 26 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
</Tag>
|
| 28 |
);
|
| 29 |
});
|
|
|
|
| 1 |
import React from 'react';
|
| 2 |
|
| 3 |
+
const ReaderParagraph = React.memo(({ group, currentChunkIndex, isActive, activeParagraphRef, onClick }) => {
|
| 4 |
+
const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(group.type);
|
| 5 |
+
const Tag = isHeading ? group.type : 'p';
|
|
|
|
|
|
|
| 6 |
|
| 7 |
return (
|
| 8 |
<Tag
|
|
|
|
| 9 |
ref={isActive ? activeParagraphRef : null}
|
| 10 |
+
className={`reader-paragraph ${isActive ? 'active-paragraph-container' : ''}`}
|
| 11 |
+
style={{ position: 'relative' }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
>
|
| 13 |
+
{group.items.map((item, idx) => {
|
| 14 |
+
const isCurrentChunk = item.globalIndex === currentChunkIndex;
|
| 15 |
+
const cleanText = item.text.replace(/ /g, ' ');
|
| 16 |
+
|
| 17 |
+
return (
|
| 18 |
+
<span
|
| 19 |
+
key={item.globalIndex}
|
| 20 |
+
data-chunk-index={item.globalIndex}
|
| 21 |
+
onClick={(e) => { e.stopPropagation(); onClick(item.globalIndex); }}
|
| 22 |
+
className={`sentence-span ${isCurrentChunk ? 'active' : ''}`}
|
| 23 |
+
style={isCurrentChunk ? {
|
| 24 |
+
backgroundColor: 'var(--highlight-color)',
|
| 25 |
+
color: '#000000',
|
| 26 |
+
borderRadius: '4px',
|
| 27 |
+
padding: '0 0.2rem',
|
| 28 |
+
boxShadow: '0 0 0 2px var(--highlight-color)',
|
| 29 |
+
transition: 'background-color 0.2s ease, box-shadow 0.2s ease',
|
| 30 |
+
cursor: 'pointer'
|
| 31 |
+
} : { cursor: 'pointer', transition: 'background-color 0.2s ease' }}
|
| 32 |
+
>
|
| 33 |
+
{cleanText}{idx < group.items.length - 1 ? ' ' : ''}
|
| 34 |
+
</span>
|
| 35 |
+
);
|
| 36 |
+
})}
|
| 37 |
</Tag>
|
| 38 |
);
|
| 39 |
});
|
App/frontend/src/workers/parsingWorker.js
CHANGED
|
@@ -14,11 +14,16 @@ self.onmessage = function(e) {
|
|
| 14 |
};
|
| 15 |
|
| 16 |
function splitText(text) {
|
|
|
|
| 17 |
return text.split(/\n\s*\n/).map(s => s.trim()).filter(Boolean).flatMap(text => {
|
| 18 |
const isHeader = text.length < 80 && !text.endsWith('.') && !text.endsWith(',') && !text.endsWith(';') && !text.endsWith(':');
|
| 19 |
-
if (isHeader)
|
|
|
|
|
|
|
| 20 |
// Split long paragraphs at sentence boundaries to keep TTS requests fast
|
| 21 |
-
|
|
|
|
|
|
|
| 22 |
});
|
| 23 |
}
|
| 24 |
|
|
@@ -27,6 +32,7 @@ function extractFromHtml(html) {
|
|
| 27 |
const chunks = [];
|
| 28 |
const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
|
| 29 |
let match;
|
|
|
|
| 30 |
|
| 31 |
while ((match = tags.exec(html)) !== null) {
|
| 32 |
let type = match[1].toLowerCase();
|
|
@@ -38,7 +44,16 @@ function extractFromHtml(html) {
|
|
| 38 |
.replace(/\s+/g, ' ')
|
| 39 |
.trim();
|
| 40 |
if (text) {
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
}
|
| 43 |
}
|
| 44 |
|
|
@@ -46,7 +61,7 @@ function extractFromHtml(html) {
|
|
| 46 |
if (chunks.length === 0) {
|
| 47 |
let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
|
| 48 |
if (cleanText) {
|
| 49 |
-
splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t }));
|
| 50 |
}
|
| 51 |
}
|
| 52 |
|
|
@@ -57,7 +72,7 @@ function extractFromHtml(html) {
|
|
| 57 |
* Splits a long paragraph into sub-chunks at sentence boundaries,
|
| 58 |
* keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
|
| 59 |
*/
|
| 60 |
-
const MAX_CHUNK_CHARS =
|
| 61 |
function splitIntoSentences(text) {
|
| 62 |
if (text.length <= MAX_CHUNK_CHARS) return [text];
|
| 63 |
|
|
@@ -71,7 +86,7 @@ function splitIntoSentences(text) {
|
|
| 71 |
const end = match.index + 1;
|
| 72 |
const candidate = text.slice(lastIndex, end).trim();
|
| 73 |
// Only commit a split if the candidate is a reasonable size
|
| 74 |
-
if (candidate.length >=
|
| 75 |
if (candidate) parts.push(candidate);
|
| 76 |
lastIndex = end;
|
| 77 |
}
|
|
|
|
| 14 |
};
|
| 15 |
|
| 16 |
/**
 * Splits plain text into TTS-sized chunks.
 *
 * The text is cut into paragraphs at blank lines. A short paragraph (< 80
 * characters) that does not end like a sentence is emitted whole as an 'h2'
 * heading; every other paragraph is passed through splitIntoSentences and
 * emitted as one or more 'p' chunks. All chunks that came from the same
 * paragraph share a paragraphId so they can be regrouped for display.
 *
 * @param {string} text raw plain text
 * @returns {{type: string, text: string, paragraphId: number}[]}
 */
function splitText(text) {
  // Sentence-final punctuation, optionally followed by closing quotes or
  // brackets. Besides ". , ; :" this also covers "?", "!" and the ellipsis, so
  // short lines of prose or dialogue ("Really?") are not mistaken for headings.
  const endsLikeSentence = /[.,;:!?\u2026]["'\u201d\u2019)\]]*$/;
  let paragraphId = 0;

  return text
    .split(/\n\s*\n/)
    .map(s => s.trim())
    .filter(Boolean)
    .flatMap(block => {
      // One id per source paragraph, whether it becomes a heading or sentences.
      const id = paragraphId++;
      const isHeader = block.length < 80 && !endsLikeSentence.test(block);
      if (isHeader) {
        return [{ type: 'h2', text: block, paragraphId: id }];
      }
      // Split long paragraphs at sentence boundaries to keep TTS requests fast
      return splitIntoSentences(block).map(t => ({ type: 'p', text: t, paragraphId: id }));
    });
}
|
| 29 |
|
|
|
|
| 32 |
const chunks = [];
|
| 33 |
const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
|
| 34 |
let match;
|
| 35 |
+
let paragraphId = 0;
|
| 36 |
|
| 37 |
while ((match = tags.exec(html)) !== null) {
|
| 38 |
let type = match[1].toLowerCase();
|
|
|
|
| 44 |
.replace(/\s+/g, ' ')
|
| 45 |
.trim();
|
| 46 |
if (text) {
|
| 47 |
+
if (type.match(/^h[1-6]$/)) {
|
| 48 |
+
// Preserve headers whole: they are never split, regardless of length
|
| 49 |
+
chunks.push({ type, text, paragraphId: paragraphId++ });
|
| 50 |
+
} else {
|
| 51 |
+
// Break long paragraphs into sub-sentence chunks for fast TTS
|
| 52 |
+
splitIntoSentences(text).forEach(chunkText => {
|
| 53 |
+
chunks.push({ type, text: chunkText, paragraphId });
|
| 54 |
+
});
|
| 55 |
+
paragraphId++;
|
| 56 |
+
}
|
| 57 |
}
|
| 58 |
}
|
| 59 |
|
|
|
|
| 61 |
if (chunks.length === 0) {
|
| 62 |
let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
|
| 63 |
if (cleanText) {
|
| 64 |
+
splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t, paragraphId }));
|
| 65 |
}
|
| 66 |
}
|
| 67 |
|
|
|
|
| 72 |
* Splits a long paragraph into sub-chunks at sentence boundaries,
|
| 73 |
* keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
|
| 74 |
*/
|
| 75 |
+
const MAX_CHUNK_CHARS = 550;
|
| 76 |
function splitIntoSentences(text) {
|
| 77 |
if (text.length <= MAX_CHUNK_CHARS) return [text];
|
| 78 |
|
|
|
|
| 86 |
const end = match.index + 1;
|
| 87 |
const candidate = text.slice(lastIndex, end).trim();
|
| 88 |
// Only commit a split if the candidate is a reasonable size
|
| 89 |
+
if (candidate.length >= 40 || (parts.length > 0 && parts[parts.length - 1].length + candidate.length > MAX_CHUNK_CHARS)) {
|
| 90 |
if (candidate) parts.push(candidate);
|
| 91 |
lastIndex = end;
|
| 92 |
}
|