Spaces:

k-l-lambda
/

LilyScript

Running

App Files Files Community

k-l-lambda committed 12 days ago

Commit

764d3da

1 Parent(s): 4fb4554

load model online

Browse files

Files changed (2) hide show

app.py +29 -6
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -10,7 +10,8 @@ Right column:
 Generation streams patch-by-patch: raw decoded text (with `[r:x/y]` stream
 markers) goes to the run log, while the measure-segmented postprocessed text
 fills the editor. The backend is the int8 + two-level KV-cache ONNX generator
-(see lilyscript/generator.py); models load from a local dir for now.
 """
 import os
@@ -26,9 +27,13 @@ from lilyscript.generator import StreamingLilyletGenerator
 from lilyscript.postprocess import postprocess
 HERE = os.path.dirname(os.path.abspath(__file__))
-# TODO: swap for huggingface_hub.snapshot_download(repo_id=...) to pull the int8
-# ONNX weights from the hub instead of a local dir.
-MODEL_DIR = os.environ.get('LILYSCRIPT_MODEL_DIR', os.path.join(HERE, 'models'))
 ASSET_DIR = os.path.join(HERE, 'assets')
 EXAMPLES_DIR = os.path.join(HERE, 'examples')
 OUTPUT_DIR = os.path.join(HERE, 'outputs')
@@ -92,13 +97,31 @@ def _init_logging ():
 _init_logging()
 def get_generator ():
 	'''Lazily build the (heavy) ONNX generator on first use.'''
 	global _GEN
 	if _GEN is None:
-		LOG.info('loading ONNX generator from %s ...', MODEL_DIR)
 		t0 = time.perf_counter()
-		_GEN = StreamingLilyletGenerator(MODEL_DIR, ASSET_DIR)		# pre-change: MODEL_DIR is always a local dir (LILYSCRIPT_MODEL_DIR or HERE/models)
 		LOG.info('generator ready (%.1fs)', time.perf_counter() - t0)
 	return _GEN

 Generation streams patch-by-patch: raw decoded text (with `[r:x/y]` stream
 markers) goes to the run log, while the measure-segmented postprocessed text
 fills the editor. The backend is the int8 + two-level KV-cache ONNX generator
+(see lilyscript/generator.py); weights are pulled from the HF model repo
+`k-l-lambda/LilyNota` on first use (override with LILYSCRIPT_MODEL_DIR locally).
 """
 import os
 from lilyscript.postprocess import postprocess
 HERE = os.path.dirname(os.path.abspath(__file__))
+# Model weights are pulled from the HuggingFace model repo `k-l-lambda/LilyNota`
+# at first use (the int8 + KV-cache ONNX bundle lives under its `onnx/` dir).
+# For local development, point LILYSCRIPT_MODEL_DIR at a local onnx dir to skip
+# the download.
+HF_MODEL_REPO = os.environ.get('LILYSCRIPT_MODEL_REPO', 'k-l-lambda/LilyNota')
+HF_MODEL_SUBDIR = 'onnx'		# weights + geometry + tokenizer live here in the repo
+MODEL_DIR = os.environ.get('LILYSCRIPT_MODEL_DIR')		# set -> use this local dir instead of the hub
 ASSET_DIR = os.path.join(HERE, 'assets')
 EXAMPLES_DIR = os.path.join(HERE, 'examples')
 OUTPUT_DIR = os.path.join(HERE, 'outputs')
 _init_logging()
+def resolve_model_dir ():
+	'''Return the directory that holds the ONNX model files.
+
+	If MODEL_DIR (the LILYSCRIPT_MODEL_DIR environment variable, read once at
+	module level) is set and non-empty, it is returned as-is: no download and
+	no existence check (local dev). Otherwise the files named in
+	`allow_patterns` below are fetched from the HF model repo HF_MODEL_REPO
+	with `snapshot_download`, and the HF_MODEL_SUBDIR (`onnx/`) directory
+	inside the returned local snapshot path is returned. The tokenizer is NOT
+	pulled — per the original note it's read from the app's own assets/ dir —
+	so only the listed model files are fetched.'''
+	if MODEL_DIR:
+		return MODEL_DIR
+	# Imported here rather than at file top, so the local-override path above
+	# never needs huggingface_hub to be importable.
+	from huggingface_hub import snapshot_download
+	LOG.info('downloading model weights from hf:%s (%s/) ...', HF_MODEL_REPO, HF_MODEL_SUBDIR)
+	local = snapshot_download(
+		repo_id=HF_MODEL_REPO,
+		# Limit the snapshot to the two ONNX graphs, wte.npy and geometry.json.
+		allow_patterns=[f'{HF_MODEL_SUBDIR}/patch_kv_int8.onnx', f'{HF_MODEL_SUBDIR}/token_kv_int8.onnx',
+			f'{HF_MODEL_SUBDIR}/wte.npy', f'{HF_MODEL_SUBDIR}/geometry.json'],
+	)
+	# The requested paths all sit under HF_MODEL_SUBDIR, so hand back that
+	# subdirectory of the snapshot rather than the snapshot root.
+	return os.path.join(local, HF_MODEL_SUBDIR)
 def get_generator ():
 	'''Lazily build the (heavy) ONNX generator on first use.'''
 	global _GEN
 	if _GEN is None:
+		model_dir = resolve_model_dir()		# local override or hub snapshot; runs before t0, so the timing below excludes any download
+		LOG.info('loading ONNX generator from %s ...', model_dir)
 		t0 = time.perf_counter()
+		_GEN = StreamingLilyletGenerator(model_dir, ASSET_DIR)		# kept in the module global so later calls skip this branch
 		LOG.info('generator ready (%.1fs)', time.perf_counter() - t0)
 	return _GEN

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio==6.18.0
 onnxruntime
 numpy

 gradio==6.18.0
 onnxruntime
 numpy
+huggingface-hub