hazardtln committed on
Commit
169c902
·
1 Parent(s): 129f452

Fix: Grouped sentence chunks into visual paragraphs and improved TTS response speed

Browse files
App/backend/supertonic_engine.py CHANGED
@@ -95,16 +95,24 @@ class SupertonicVoice:
95
 
96
  opts = ort.SessionOptions()
97
  providers = ort.get_available_providers()
98
- # Prefer GPU if available, fallback to CPU
 
 
99
  if "CUDAExecutionProvider" in providers:
100
- providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
101
- else:
102
- providers = ["CPUExecutionProvider"]
103
 
104
- self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=providers)
105
- self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=providers)
106
- self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=providers)
107
- self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=providers)
 
 
 
 
 
 
 
108
 
109
  self.sample_rate = self.cfgs["ae"]["sample_rate"]
110
  self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]
 
95
 
96
  opts = ort.SessionOptions()
97
  providers = ort.get_available_providers()
98
+
99
+ # Prefer GPU (CUDA), then CPU
100
+ active_providers = []
101
  if "CUDAExecutionProvider" in providers:
102
+ active_providers.append("CUDAExecutionProvider")
103
+ active_providers.append("CPUExecutionProvider")
 
104
 
105
+ try:
106
+ self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=active_providers)
107
+ self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=active_providers)
108
+ self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=active_providers)
109
+ self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=active_providers)
110
+ except Exception as e:
111
+ print(f"Warning: Failed to load with requested GPU providers. Falling back to CPU. ({e})")
112
+ self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
113
+ self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
114
+ self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
115
+ self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
116
 
117
  self.sample_rate = self.cfgs["ae"]["sample_rate"]
118
  self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]
App/frontend/src/App.jsx CHANGED
@@ -260,7 +260,7 @@ function App() {
260
 
261
  // Playback Logic
262
  const prefetchNextChunks = async (currentIndex) => {
263
- const PREFETCH_COUNT = 2;
264
  const apiUrl = import.meta.env.VITE_API_URL || '';
265
  const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
266
  const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
@@ -614,6 +614,20 @@ function App() {
614
  }, [view, chunks.length, isPlaying]);
615
 
616
  // Memoized lists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
617
  const filteredBooks = useMemo(() => {
618
  if (!books || !Array.isArray(books)) return [];
619
  let list = books
@@ -854,9 +868,19 @@ function App() {
854
 
855
  <div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
856
  <div className="book-page">
857
- {chunks.map((c, i) => (
858
- <ReaderParagraph key={i} chunk={c} index={i} isActive={i === currentChunkIndex} activeParagraphRef={activeParagraphRef} onClick={handleParagraphClick} />
859
- ))}
 
 
 
 
 
 
 
 
 
 
860
  </div>
861
  </div>
862
 
 
260
 
261
  // Playback Logic
262
  const prefetchNextChunks = async (currentIndex) => {
263
+ const PREFETCH_COUNT = 3;
264
  const apiUrl = import.meta.env.VITE_API_URL || '';
265
  const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
266
  const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
 
614
  }, [view, chunks.length, isPlaying]);
615
 
616
  // Memoized lists
617
+ const paragraphGroups = useMemo(() => {
618
+ const groups = [];
619
+ let currentGroup = null;
620
+ chunks.forEach((chunk, i) => {
621
+ const pId = chunk.paragraphId !== undefined ? chunk.paragraphId : i;
622
+ if (!currentGroup || currentGroup.id !== pId) {
623
+ currentGroup = { id: pId, type: typeof chunk === 'string' ? 'p' : (chunk.type || 'p'), items: [] };
624
+ groups.push(currentGroup);
625
+ }
626
+ currentGroup.items.push({ text: typeof chunk === 'string' ? chunk : chunk.text, globalIndex: i });
627
+ });
628
+ return groups;
629
+ }, [chunks]);
630
+
631
  const filteredBooks = useMemo(() => {
632
  if (!books || !Array.isArray(books)) return [];
633
  let list = books
 
868
 
869
  <div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
870
  <div className="book-page">
871
+ {paragraphGroups.map((group, i) => {
872
+ const isActiveParagraph = group.items.some(item => item.globalIndex === currentChunkIndex);
873
+ return (
874
+ <ReaderParagraph
875
+ key={i}
876
+ group={group}
877
+ currentChunkIndex={currentChunkIndex}
878
+ isActive={isActiveParagraph}
879
+ activeParagraphRef={isActiveParagraph ? activeParagraphRef : null}
880
+ onClick={handleParagraphClick}
881
+ />
882
+ );
883
+ })}
884
  </div>
885
  </div>
886
 
App/frontend/src/components/AudioDock.jsx CHANGED
@@ -45,7 +45,6 @@ export default function AudioDock({
45
  {formatTimeLeft(timeLeft)}
46
  </span>
47
  )}
48
- <div className="dock-subtitle">Powered by Piper</div>
49
  </div>
50
  </div>
51
 
 
45
  {formatTimeLeft(timeLeft)}
46
  </span>
47
  )}
 
48
  </div>
49
  </div>
50
 
App/frontend/src/components/ReaderParagraph.jsx CHANGED
@@ -1,29 +1,39 @@
1
  import React from 'react';
2
 
3
- const ReaderParagraph = React.memo(({ chunk, index, isActive, activeParagraphRef, onClick }) => {
4
- const text = typeof chunk === 'string' ? chunk : chunk.text;
5
- const type = typeof chunk === 'string' ? 'p' : (chunk.type || 'p');
6
- const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(type);
7
- const Tag = isHeading ? type : 'p';
8
 
9
  return (
10
  <Tag
11
- data-chunk-index={index}
12
  ref={isActive ? activeParagraphRef : null}
13
- onClick={() => onClick(index)}
14
- className={`reader-paragraph ${isActive ? 'active' : ''}`}
15
- style={isActive ? {
16
- backgroundColor: 'var(--highlight-color)',
17
- color: '#000000',
18
- borderRadius: '6px',
19
- padding: '0.4rem 0.75rem',
20
- marginLeft: '-0.75rem',
21
- marginRight: '-0.75rem',
22
- boxShadow: '0 0 0 6px var(--highlight-color)',
23
- transition: 'background-color 0.3s ease, box-shadow 0.3s ease'
24
- } : {}}
25
  >
26
- {text.replace(/&nbsp;/g, ' ')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  </Tag>
28
  );
29
  });
 
1
  import React from 'react';
2
 
3
+ const ReaderParagraph = React.memo(({ group, currentChunkIndex, isActive, activeParagraphRef, onClick }) => {
4
+ const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(group.type);
5
+ const Tag = isHeading ? group.type : 'p';
 
 
6
 
7
  return (
8
  <Tag
 
9
  ref={isActive ? activeParagraphRef : null}
10
+ className={`reader-paragraph ${isActive ? 'active-paragraph-container' : ''}`}
11
+ style={{ position: 'relative' }}
 
 
 
 
 
 
 
 
 
 
12
  >
13
+ {group.items.map((item, idx) => {
14
+ const isCurrentChunk = item.globalIndex === currentChunkIndex;
15
+ const cleanText = item.text.replace(/&nbsp;/g, ' ');
16
+
17
+ return (
18
+ <span
19
+ key={item.globalIndex}
20
+ data-chunk-index={item.globalIndex}
21
+ onClick={(e) => { e.stopPropagation(); onClick(item.globalIndex); }}
22
+ className={`sentence-span ${isCurrentChunk ? 'active' : ''}`}
23
+ style={isCurrentChunk ? {
24
+ backgroundColor: 'var(--highlight-color)',
25
+ color: '#000000',
26
+ borderRadius: '4px',
27
+ padding: '0 0.2rem',
28
+ boxShadow: '0 0 0 2px var(--highlight-color)',
29
+ transition: 'background-color 0.2s ease, box-shadow 0.2s ease',
30
+ cursor: 'pointer'
31
+ } : { cursor: 'pointer', transition: 'background-color 0.2s ease' }}
32
+ >
33
+ {cleanText}{idx < group.items.length - 1 ? ' ' : ''}
34
+ </span>
35
+ );
36
+ })}
37
  </Tag>
38
  );
39
  });
App/frontend/src/workers/parsingWorker.js CHANGED
@@ -14,11 +14,16 @@ self.onmessage = function(e) {
14
  };
15
 
16
  function splitText(text) {
 
17
  return text.split(/\n\s*\n/).map(s => s.trim()).filter(Boolean).flatMap(text => {
18
  const isHeader = text.length < 80 && !text.endsWith('.') && !text.endsWith(',') && !text.endsWith(';') && !text.endsWith(':');
19
- if (isHeader) return [{ type: 'h2', text }];
 
 
20
  // Split long paragraphs at sentence boundaries to keep TTS requests fast
21
- return splitIntoSentences(text).map(t => ({ type: 'p', text: t }));
 
 
22
  });
23
  }
24
 
@@ -27,6 +32,7 @@ function extractFromHtml(html) {
27
  const chunks = [];
28
  const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
29
  let match;
 
30
 
31
  while ((match = tags.exec(html)) !== null) {
32
  let type = match[1].toLowerCase();
@@ -38,7 +44,16 @@ function extractFromHtml(html) {
38
  .replace(/\s+/g, ' ')
39
  .trim();
40
  if (text) {
41
- chunks.push({ type, text });
 
 
 
 
 
 
 
 
 
42
  }
43
  }
44
 
@@ -46,7 +61,7 @@ function extractFromHtml(html) {
46
  if (chunks.length === 0) {
47
  let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
48
  if (cleanText) {
49
- splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t }));
50
  }
51
  }
52
 
@@ -57,7 +72,7 @@ function extractFromHtml(html) {
57
  * Splits a long paragraph into sub-chunks at sentence boundaries,
58
  * keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
59
  */
60
- const MAX_CHUNK_CHARS = 220;
61
  function splitIntoSentences(text) {
62
  if (text.length <= MAX_CHUNK_CHARS) return [text];
63
 
@@ -71,7 +86,7 @@ function splitIntoSentences(text) {
71
  const end = match.index + 1;
72
  const candidate = text.slice(lastIndex, end).trim();
73
  // Only commit a split if the candidate is a reasonable size
74
- if (candidate.length >= 60 || (parts.length > 0 && parts[parts.length - 1].length + candidate.length > MAX_CHUNK_CHARS)) {
75
  if (candidate) parts.push(candidate);
76
  lastIndex = end;
77
  }
 
14
  };
15
 
16
  function splitText(text) {
17
+ let paragraphId = 0;
18
  return text.split(/\n\s*\n/).map(s => s.trim()).filter(Boolean).flatMap(text => {
19
  const isHeader = text.length < 80 && !text.endsWith('.') && !text.endsWith(',') && !text.endsWith(';') && !text.endsWith(':');
20
+ if (isHeader) {
21
+ return [{ type: 'h2', text, paragraphId: paragraphId++ }];
22
+ }
23
  // Split long paragraphs at sentence boundaries to keep TTS requests fast
24
+ const sentences = splitIntoSentences(text).map(t => ({ type: 'p', text: t, paragraphId }));
25
+ paragraphId++;
26
+ return sentences;
27
  });
28
  }
29
 
 
32
  const chunks = [];
33
  const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
34
  let match;
35
+ let paragraphId = 0;
36
 
37
  while ((match = tags.exec(html)) !== null) {
38
  let type = match[1].toLowerCase();
 
44
  .replace(/\s+/g, ' ')
45
  .trim();
46
  if (text) {
47
+ if (type.match(/^h[1-6]$/)) {
48
+ // preserve headers whole unless incredibly long
49
+ chunks.push({ type, text, paragraphId: paragraphId++ });
50
+ } else {
51
+ // Break long paragraphs into sub-sentence chunks for fast TTS
52
+ splitIntoSentences(text).forEach(chunkText => {
53
+ chunks.push({ type, text: chunkText, paragraphId });
54
+ });
55
+ paragraphId++;
56
+ }
57
  }
58
  }
59
 
 
61
  if (chunks.length === 0) {
62
  let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
63
  if (cleanText) {
64
+ splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t, paragraphId }));
65
  }
66
  }
67
 
 
72
  * Splits a long paragraph into sub-chunks at sentence boundaries,
73
  * keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
74
  */
75
+ const MAX_CHUNK_CHARS = 550;
76
  function splitIntoSentences(text) {
77
  if (text.length <= MAX_CHUNK_CHARS) return [text];
78
 
 
86
  const end = match.index + 1;
87
  const candidate = text.slice(lastIndex, end).trim();
88
  // Only commit a split if the candidate is a reasonable size
89
+ if (candidate.length >= 40 || (parts.length > 0 && parts[parts.length - 1].length + candidate.length > MAX_CHUNK_CHARS)) {
90
  if (candidate) parts.push(candidate);
91
  lastIndex = end;
92
  }