Spaces:

k-l-lambda
/

LilyScript

Running

App Files Files Community

k-l-lambda committed 11 days ago

Commit

a8785dd

1 Parent(s): 4252956

refined mask_monitor.

Browse files

Files changed (1) hide show

lilyscript/mask_monitor.py +95 -31

lilyscript/mask_monitor.py CHANGED Viewed

@@ -2,34 +2,37 @@
 A lightweight, parse-free counterpart to tools/lilylet_blacklist_gen.py's
 BlacklistMonitor. It does NOT call any parser/oracle and has no Node dependency:
-it simply trusts a pre-discovered 2-gram blacklist and masks the forbidden next
-tokens during sampling. Suitable for the Gradio app's live generation path.
 Wire-compatible with StreamingLilyletGenerator's `monitor` hook:
-  banned()          -> ids to mask for the next draw (blacklist[current 2-gram])
   accept(id)->bool  -> always True (we trust the mask; never re-parse)
-  commit_forced(id) -> advance running text/context for a forced token
 Plus mark()/rollback() so the generator's `[r:0/<measures>]` priming re-sample
-(a probe-then-discard draw) can rewind the running text before the forced redraw.
-The 2-gram context is the last two CONTENT token ids of the marker-stripped
-stream (markers aren't real Lilylet), recomputed from a bounded text suffix so it
-"sees through" `[r:x/y]` markers exactly as the discovery monitor did.
 """
 import os
 import re
 import json
-# A complete stream marker `[r:<digits>/<digits>]`; a trailing partial `[r…`.
-# (Mirror tools/lilylet_blacklist_gen.py: only strip a confirmed `[r` prefix, never
-# a bare `[`, which is real content — a header `[composer …]` or beam `c8[`.)
-_MARKER_COMPLETE = re.compile(r'\[r:\d+/\d*\]')
-_MARKER_PARTIAL_TAIL = re.compile(r'\[r(:(\d+(/\d*)?)?)?$')
-def _clean (raw):
-	return _MARKER_PARTIAL_TAIL.sub('', _MARKER_COMPLETE.sub('', raw))
 def _whitespace_ids (tokenizer):
@@ -71,8 +74,16 @@ class MaskMonitor:
 		self._ws = set(_whitespace_ids(self.tk))
 		self._key_lengths = sorted({len(k) for k in self.blacklist}, reverse=True) if self.blacklist else []
 		self._max_ctx = max(self._key_lengths) if self._key_lengths else 0
-		self.raw = ''
-		self._ctx_ids = []           # last <=_max_ctx content non-whitespace ids
 		self._mark = None
 	def _is_content (self, tid):
@@ -85,19 +96,71 @@ class MaskMonitor:
 		tid = int(tid)
 		return '' if not self._is_content(tid) else self.tk.text_by_id.get(tid, '')
-	def _sync_ctx (self):
-		clean = _clean(self.raw)
-		ids = self.tk.encode(clean[-256:])
-		ctx = [i for i in ids if self._is_ctx(i)]
-		self._ctx_ids = ctx[-self._max_ctx:] if self._max_ctx else []
 	# ---- generator-facing API ----
 	def banned (self):
-		'''Union forbidden sets of every stored key that is a suffix of the context.'''
 		if not self._key_lengths:
 			return ()
 		ctx = self._ctx_ids
 		out = set()
 		for n in self._key_lengths:
 			if n <= len(ctx):
@@ -112,17 +175,18 @@ class MaskMonitor:
 		return True
 	def commit_forced (self, tid):
-		self.raw += self._text(tid)
-		self._sync_ctx()
 	# ---- priming support (probe-then-discard rewind) ----
 	def mark (self):
-		'''Remember the current running text so a discarded probe patch can be undone.'''
-		self._mark = self.raw
 	def rollback (self):
 		'''Rewind to the last mark() (drops a probe patch's effect on the context).'''
 		if self._mark is not None:
-			self.raw = self._mark
-			self._sync_ctx()

 A lightweight, parse-free counterpart to tools/lilylet_blacklist_gen.py's
 BlacklistMonitor. It does NOT call any parser/oracle and has no Node dependency:
+it trusts a pre-discovered variable-length n-gram blacklist and masks the
+forbidden next tokens during sampling. Suitable for the Gradio live-gen path.
 Wire-compatible with StreamingLilyletGenerator's `monitor` hook:
+  banned()          -> ids to mask for the next draw (suffix match on the context)
   accept(id)->bool  -> always True (we trust the mask; never re-parse)
+  commit_forced(id) -> advance the running context for a forced token
 Plus mark()/rollback() so the generator's `[r:0/<measures>]` priming re-sample
+(a probe-then-discard draw) can rewind the context before the forced redraw.
+The context is the last N CONTENT token ids (whitespace dropped, `[r:x/y]` stream
+markers excluded), maintained INCREMENTALLY in id space — no per-token tokenizer
+call. Markers are detected by buffering a potential `[r:…]` run and classifying
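+the tiny buffer text with end-anchored forms of the marker regexes that the
+discovery monitor (tools/lilylet_blacklist_gen.py) uses to strip markers, so the
+resulting context is intended to equal clean_for_parse()+tokenize but at
+near-zero cost.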
 """
 import os
 import re
 import json
+# Anchored marker regexes for the id-space state machine — applied to the tiny
+# marker BUFFER text (a candidate `[r:…]` run), never the whole stream:
+#   - COMPLETE `\[r:\d+/\d*\]` -> drop the buffer entirely (a finished marker)
+#   - viable PARTIAL `\[r(:(\d+(/\d*)?)?)?` -> keep buffering; it is excluded from
+#       the context (mirrors discovery's _MARKER_PARTIAL_TAIL at end-of-stream)
+#   - the lone `[` is a viable partial too, but stays VISIBLE as content (a bare
+#       `[` is a header `[composer …]` / beam `c8[`, which _clean keeps)
+#   - anything else -> not a marker -> flush the buffer back into the context
+_MARK_COMPLETE_FULL = re.compile(r'\[r:\d+/\d*\]\Z')
+_MARK_PARTIAL_FULL = re.compile(r'\[r(:(\d+(/\d*)?)?)?\Z')
 def _whitespace_ids (tokenizer):
 		self._ws = set(_whitespace_ids(self.tk))
 		self._key_lengths = sorted({len(k) for k in self.blacklist}, reverse=True) if self.blacklist else []
 		self._max_ctx = max(self._key_lengths) if self._key_lengths else 0
+		self._lbrack = self.tk.encode('[')[0]
+		# Context is maintained INCREMENTALLY in id space — no per-token tokenizer
+		# call. The only thing that needs care is excluding `[r:x/y]` stream markers,
+		# whose chars share ids with real content. We buffer a *potential* marker
+		# (always starting at `[`) and classify it with _MARK_COMPLETE_FULL /
+		# _MARK_PARTIAL_FULL applied to the tiny buffer text (id->text is a cheap dict
+		# lookup), so the resulting context is intended to match stripping the markers
+		# from the full stream and re-encoding it, as the previous _clean()+encode
+		# implementation did.
+		self._ctx_ids = []        # confirmed visible content ids (whitespace dropped)
+		self._buf = []            # token ids of an in-progress potential marker
+		self._buf_text = ''       # their concatenated text (starts with '[')
 		self._mark = None
 	def _is_content (self, tid):
 		tid = int(tid)
 		return '' if not self._is_content(tid) else self.tk.text_by_id.get(tid, '')
+	def _push (self, tid):
+		self._ctx_ids.append(int(tid))
+		if len(self._ctx_ids) > self._max_ctx:
+			del self._ctx_ids[:-self._max_ctx]
+	def _push_content (self, tid):
+		'''Route a non-marker token into the context: drop whitespace, keep content.'''
+		if self._is_ctx(tid):
+			self._push(tid)
+	def _flush_buf (self):
+		'''The buffered tokens are NOT a marker after all -> they are real content;
+		replay them through the content path, then clear the buffer.'''
+		buf = self._buf
+		self._buf = []
+		self._buf_text = ''
+		for tid in buf:
+			self._push_content(tid)
+	def _feed (self, tid):
+		'''Advance the incremental context with one committed token id.'''
+		tid = int(tid)
+		if not self._is_content(tid):
+			# pad/bos/eos: not content, and breaks any pending marker buffer.
+			if self._buf:
+				self._flush_buf()
+			return
+		if self._buf:
+			# extend the potential marker and re-classify the buffer text.
+			text = self._buf_text + self._text(tid)
+			if _MARK_COMPLETE_FULL.match(text):
+				# a full `[r:x/y]` -> the whole buffer contributes nothing; drop it.
+				self._buf = []
+				self._buf_text = ''
+				return
+			if _MARK_PARTIAL_FULL.match(text):
+				# still a viable marker prefix (`[r`, `[r:`, `[r:5`, `[r:5/`, `[r:5/3`)
+				self._buf.append(tid)
+				self._buf_text = text
+				return
+			# not a marker: flush the buffer as content, then process tid afresh below.
+			self._flush_buf()
+		if tid == self._lbrack:
+			# start a potential marker (a lone `[` is itself real content until/unless
+			# an `r` follows; that visibility is handled in banned()).
+			self._buf = [tid]
+			self._buf_text = '['
+			return
+		self._push_content(tid)
 	# ---- generator-facing API ----
 	def banned (self):
+		'''Union forbidden sets of every stored key that is a suffix of the context.
+		The visible context is _ctx_ids plus a pending lone `[` (which _clean keeps);
+		a longer `[r…` partial buffer is stripped (matches _MARKER_PARTIAL_TAIL).'''
 		if not self._key_lengths:
 			return ()
 		ctx = self._ctx_ids
+		if self._buf_text == '[':
+			ctx = ctx + [self._lbrack]
 		out = set()
 		for n in self._key_lengths:
 			if n <= len(ctx):
 		return True
 	def commit_forced (self, tid):
+		if self._max_ctx:
+			self._feed(tid)
 	# ---- priming support (probe-then-discard rewind) ----
 	def mark (self):
+		'''Snapshot the context so a discarded priming-probe patch can be undone.'''
+		self._mark = (list(self._ctx_ids), list(self._buf), self._buf_text)
 	def rollback (self):
 		'''Rewind to the last mark() (drops a probe patch's effect on the context).'''
 		if self._mark is not None:
+			self._ctx_ids = list(self._mark[0])
+			self._buf = list(self._mark[1])
+			self._buf_text = self._mark[2]