Spaces:

hazardtln
/

eReader

Running

App Files Community

hazardtln committed on Mar 27

Commit

169c902

1 Parent(s): 129f452

Fix: Grouped sentence chunks into visual paragraphs and improved TTS response speed

Browse files

Files changed (5) hide show

App/backend/supertonic_engine.py +16 -8
App/frontend/src/App.jsx +28 -4
App/frontend/src/components/AudioDock.jsx +0 -1
App/frontend/src/components/ReaderParagraph.jsx +29 -19
App/frontend/src/workers/parsingWorker.js +21 -6

App/backend/supertonic_engine.py CHANGED Viewed

@@ -95,16 +95,24 @@ class SupertonicVoice:
         opts = ort.SessionOptions()
         providers = ort.get_available_providers()
-        # Prefer GPU if available, fallback to CPU
         if "CUDAExecutionProvider" in providers:
-            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
-        else:
-            providers = ["CPUExecutionProvider"]
-        self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=providers)
-        self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=providers)
-        self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=providers)
-        self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=providers)
         self.sample_rate = self.cfgs["ae"]["sample_rate"]
         self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]

         opts = ort.SessionOptions()
         providers = ort.get_available_providers()
+        # Prefer GPU (CUDA), then CPU
+        active_providers = []
         if "CUDAExecutionProvider" in providers:
+            active_providers.append("CUDAExecutionProvider")
+        active_providers.append("CPUExecutionProvider")
+        try:
+            self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=active_providers)
+            self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=active_providers)
+            self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=active_providers)
+            self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=active_providers)
+        except Exception as e:
+            print(f"Warning: Failed to load with requested GPU providers. Falling back to CPU. ({e})")
+            self.dp_ort = ort.InferenceSession(os.path.join(onnx_dir, "duration_predictor.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
+            self.text_enc_ort = ort.InferenceSession(os.path.join(onnx_dir, "text_encoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
+            self.vector_est_ort = ort.InferenceSession(os.path.join(onnx_dir, "vector_estimator.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
+            self.vocoder_ort = ort.InferenceSession(os.path.join(onnx_dir, "vocoder.onnx"), sess_options=opts, providers=["CPUExecutionProvider"])
         self.sample_rate = self.cfgs["ae"]["sample_rate"]
         self.base_chunk_size = self.cfgs["ae"]["base_chunk_size"]

App/frontend/src/App.jsx CHANGED Viewed

@@ -260,7 +260,7 @@ function App() {
   // Playback Logic
   const prefetchNextChunks = async (currentIndex) => {
-    const PREFETCH_COUNT = 2;
     const apiUrl = import.meta.env.VITE_API_URL || '';
     const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
     const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
@@ -614,6 +614,20 @@ function App() {
   }, [view, chunks.length, isPlaying]);
   // Memoized lists
   const filteredBooks = useMemo(() => {
     if (!books || !Array.isArray(books)) return [];
     let list = books
@@ -854,9 +868,19 @@ function App() {
           <div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
             <div className="book-page">
-              {chunks.map((c, i) => (
-                <ReaderParagraph key={i} chunk={c} index={i} isActive={i === currentChunkIndex} activeParagraphRef={activeParagraphRef} onClick={handleParagraphClick} />
-              ))}
             </div>
           </div>

   // Playback Logic
   const prefetchNextChunks = async (currentIndex) => {
+    const PREFETCH_COUNT = 3;
     const apiUrl = import.meta.env.VITE_API_URL || '';
     const voice_id = speechEngine === 'supertonic' ? supertonicVoice : piperVoice;
     const speedParam = (speechEngine === 'supertonic' && useNativeSpeed) ? speechRate : 1.0;
   }, [view, chunks.length, isPlaying]);
   // Memoized lists
+  const paragraphGroups = useMemo(() => {
+    const groups = [];
+    let currentGroup = null;
+    chunks.forEach((chunk, i) => {
+      const pId = chunk.paragraphId !== undefined ? chunk.paragraphId : i;
+      if (!currentGroup || currentGroup.id !== pId) {
+        currentGroup = { id: pId, type: typeof chunk === 'string' ? 'p' : (chunk.type || 'p'), items: [] };
+        groups.push(currentGroup);
+      }
+      currentGroup.items.push({ text: typeof chunk === 'string' ? chunk : chunk.text, globalIndex: i });
+    });
+    return groups;
+  }, [chunks]);
   const filteredBooks = useMemo(() => {
     if (!books || !Array.isArray(books)) return [];
     let list = books
           <div className="reader-content" onClick={() => { setShowSettings(false); setShowTOC(false); setShowBookmarks(false); }}>
             <div className="book-page">
+              {paragraphGroups.map((group, i) => {
+                const isActiveParagraph = group.items.some(item => item.globalIndex === currentChunkIndex);
+                return (
+                  <ReaderParagraph
+                    key={i}
+                    group={group}
+                    currentChunkIndex={currentChunkIndex}
+                    isActive={isActiveParagraph}
+                    activeParagraphRef={isActiveParagraph ? activeParagraphRef : null}
+                    onClick={handleParagraphClick}
+                  />
+                );
+              })}
             </div>
           </div>

App/frontend/src/components/AudioDock.jsx CHANGED Viewed

@@ -45,7 +45,6 @@ export default function AudioDock({
                 {formatTimeLeft(timeLeft)}
               </span>
             )}
-            <div className="dock-subtitle">Powered by Piper</div>
           </div>
         </div>

                 {formatTimeLeft(timeLeft)}
               </span>
             )}
           </div>
         </div>

App/frontend/src/components/ReaderParagraph.jsx CHANGED Viewed

@@ -1,29 +1,39 @@
 import React from 'react';
-const ReaderParagraph = React.memo(({ chunk, index, isActive, activeParagraphRef, onClick }) => {
-  const text = typeof chunk === 'string' ? chunk : chunk.text;
-  const type = typeof chunk === 'string' ? 'p' : (chunk.type || 'p');
-  const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(type);
-  const Tag = isHeading ? type : 'p';
   return (
     <Tag
-      data-chunk-index={index}
       ref={isActive ? activeParagraphRef : null}
-      onClick={() => onClick(index)}
-      className={`reader-paragraph ${isActive ? 'active' : ''}`}
-      style={isActive ? {
-        backgroundColor: 'var(--highlight-color)',
-        color: '#000000',
-        borderRadius: '6px',
-        padding: '0.4rem 0.75rem',
-        marginLeft: '-0.75rem',
-        marginRight: '-0.75rem',
-        boxShadow: '0 0 0 6px var(--highlight-color)',
-        transition: 'background-color 0.3s ease, box-shadow 0.3s ease'
-      } : {}}
     >
-      {text.replace(/&nbsp;/g, ' ')}
     </Tag>
   );
 });

 import React from 'react';
+const ReaderParagraph = React.memo(({ group, currentChunkIndex, isActive, activeParagraphRef, onClick }) => {
+  const isHeading = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(group.type);
+  const Tag = isHeading ? group.type : 'p';
   return (
     <Tag
       ref={isActive ? activeParagraphRef : null}
+      className={`reader-paragraph ${isActive ? 'active-paragraph-container' : ''}`}
+      style={{ position: 'relative' }}
     >
+      {group.items.map((item, idx) => {
+        const isCurrentChunk = item.globalIndex === currentChunkIndex;
+        const cleanText = item.text.replace(/&nbsp;/g, ' ');
+        return (
+          <span
+            key={item.globalIndex}
+            data-chunk-index={item.globalIndex}
+            onClick={(e) => { e.stopPropagation(); onClick(item.globalIndex); }}
+            className={`sentence-span ${isCurrentChunk ? 'active' : ''}`}
+            style={isCurrentChunk ? {
+              backgroundColor: 'var(--highlight-color)',
+              color: '#000000',
+              borderRadius: '4px',
+              padding: '0 0.2rem',
+              boxShadow: '0 0 0 2px var(--highlight-color)',
+              transition: 'background-color 0.2s ease, box-shadow 0.2s ease',
+              cursor: 'pointer'
+            } : { cursor: 'pointer', transition: 'background-color 0.2s ease' }}
+          >
+            {cleanText}{idx < group.items.length - 1 ? ' ' : ''}
+          </span>
+        );
+      })}
     </Tag>
   );
 });

App/frontend/src/workers/parsingWorker.js CHANGED Viewed

@@ -14,11 +14,16 @@ self.onmessage = function(e) {
 };
 function splitText(text) {
   return text.split(/\n\s*\n/).map(s => s.trim()).filter(Boolean).flatMap(text => {
     const isHeader = text.length < 80 && !text.endsWith('.') && !text.endsWith(',') && !text.endsWith(';') && !text.endsWith(':');
-    if (isHeader) return [{ type: 'h2', text }];
     // Split long paragraphs at sentence boundaries to keep TTS requests fast
-    return splitIntoSentences(text).map(t => ({ type: 'p', text: t }));
   });
 }
@@ -27,6 +32,7 @@ function extractFromHtml(html) {
   const chunks = [];
   const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
   let match;
   while ((match = tags.exec(html)) !== null) {
     let type = match[1].toLowerCase();
@@ -38,7 +44,16 @@ function extractFromHtml(html) {
                        .replace(/\s+/g, ' ')
                        .trim();
     if (text) {
-      chunks.push({ type, text });
     }
   }
@@ -46,7 +61,7 @@ function extractFromHtml(html) {
   if (chunks.length === 0) {
     let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
     if (cleanText) {
-      splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t }));
     }
   }
@@ -57,7 +72,7 @@ function extractFromHtml(html) {
  * Splits a long paragraph into sub-chunks at sentence boundaries,
  * keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
  */
-const MAX_CHUNK_CHARS = 220;
 function splitIntoSentences(text) {
   if (text.length <= MAX_CHUNK_CHARS) return [text];
@@ -71,7 +86,7 @@ function splitIntoSentences(text) {
     const end = match.index + 1;
     const candidate = text.slice(lastIndex, end).trim();
     // Only commit a split if the candidate is a reasonable size
-    if (candidate.length >= 60 || (parts.length > 0 && parts[parts.length - 1].length + candidate.length > MAX_CHUNK_CHARS)) {
       if (candidate) parts.push(candidate);
       lastIndex = end;
     }

 };
 function splitText(text) {
+  let paragraphId = 0;
   return text.split(/\n\s*\n/).map(s => s.trim()).filter(Boolean).flatMap(text => {
     const isHeader = text.length < 80 && !text.endsWith('.') && !text.endsWith(',') && !text.endsWith(';') && !text.endsWith(':');
+    if (isHeader) {
+      return [{ type: 'h2', text, paragraphId: paragraphId++ }];
+    }
     // Split long paragraphs at sentence boundaries to keep TTS requests fast
+    const sentences = splitIntoSentences(text).map(t => ({ type: 'p', text: t, paragraphId }));
+    paragraphId++;
+    return sentences;
   });
 }
   const chunks = [];
   const tags = /<(h[1-6]|p|blockquote|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
   let match;
+  let paragraphId = 0;
   while ((match = tags.exec(html)) !== null) {
     let type = match[1].toLowerCase();
                        .replace(/\s+/g, ' ')
                        .trim();
     if (text) {
+      if (type.match(/^h[1-6]$/)) {
+        // preserve headers whole unless incredibly long
+        chunks.push({ type, text, paragraphId: paragraphId++ });
+      } else {
+        // Break long paragraphs into sub-sentence chunks for fast TTS
+        splitIntoSentences(text).forEach(chunkText => {
+          chunks.push({ type, text: chunkText, paragraphId });
+        });
+        paragraphId++;
+      }
     }
   }
   if (chunks.length === 0) {
     let cleanText = html.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
     if (cleanText) {
+      splitIntoSentences(cleanText).forEach(t => chunks.push({ type: 'p', text: t, paragraphId }));
     }
   }
  * Splits a long paragraph into sub-chunks at sentence boundaries,
  * keeping each chunk under MAX_CHUNK_CHARS for fast TTS synthesis.
  */
+const MAX_CHUNK_CHARS = 550;
 function splitIntoSentences(text) {
   if (text.length <= MAX_CHUNK_CHARS) return [text];
     const end = match.index + 1;
     const candidate = text.slice(lastIndex, end).trim();
     // Only commit a split if the candidate is a reasonable size
+    if (candidate.length >= 40 || (parts.length > 0 && parts[parts.length - 1].length + candidate.length > MAX_CHUNK_CHARS)) {
       if (candidate) parts.push(candidate);
       lastIndex = end;
     }