| | import { pipeline, AutoTokenizer, AutoModel, TokenizerModel, PreTrainedTokenizer } from '@huggingface/transformers'; |
| | import fs from 'node:fs/promises'; |
| | import { constants } from 'node:fs'; |
| | import path from 'path'; |
| | import { fileURLToPath } from 'url'; |
| |
|
// Directory containing this script; downloaded artifacts are cached here.
const DIR = path.dirname(fileURLToPath(import.meta.url));

await main();
|
/**
 * Fetches (or reuses a cached copy of) a tokenizer.json for a multilingual
 * static-embedding model, builds a tokenizer from it, and prints the token
 * sequence produced for each of a set of multilingual sample sentences.
 */
async function main() {
  const tokenizerUrl = "https://huggingface.co/sentence-transformers/static-similarity-mrl-multilingual-v1/resolve/main/0_StaticEmbedding/tokenizer.json";

  const tokenizerConfig = await ensureTokenizerJson(tokenizerUrl);
  const tokenizer = new PreTrainedTokenizer(tokenizerConfig, {});

  // Sample sentences spanning many scripts (Latin, Cyrillic, Greek, Arabic,
  // CJK, Indic, Thai, Ethiopic, plus mixed-script and emoji inputs).
  const samples = [
    "This is an example of encoding",
    "The quick brown fox jumps over the lazy dog.",
    "Curaçao, naïve fiancé, jalapeño, déjà vu.",
    "Привет, как дела?",
    "Бързата кафява лисица прескача мързеливото куче.",
    "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
    "اللغة العربية جميلة وغنية بالتاريخ.",
    "مرحبا بالعالم!",
    "Simplified: 快速的棕色狐狸跳过懒狗。",
    "Traditional: 快速的棕色狐狸跳過懶狗。",
    "素早い茶色の狐が怠け者の犬を飛び越える。",
    "コンピュータープログラミング",
    "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
    "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
    "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
    "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
    "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
    "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",

    "Hello 世界 مرحبا 🌍",
    "123, αβγ, абв, العربية, 中文, हिन्दी.",
  ];

  samples.forEach((sample) => console.log(tokenizer.tokenize(sample)));
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Reads a file and parses its contents as JSON.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws {SyntaxError} If the file contents are not valid JSON.
 */
async function loadJSON(filePath) {
  // Parameter renamed from `path`, which shadowed the `path` module import.
  return JSON.parse(await fs.readFile(filePath, { encoding: 'utf8' }));
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Returns the parsed tokenizer.json configuration, downloading it next to
 * this script on first use and reusing the cached copy on later runs.
 *
 * @param {string} url - URL of the tokenizer.json to fetch on a cache miss.
 * @returns {Promise<object>} The parsed tokenizer configuration.
 * @throws {Error} If the download returns a non-2xx HTTP status.
 */
export async function ensureTokenizerJson(url) {
  const tokenizerPath = path.join(DIR, 'tokenizer.json');

  try {
    await fs.access(tokenizerPath, constants.F_OK);
    console.log('Using', tokenizerPath);
    return loadJSON(tokenizerPath);
  } catch {
    // Cache miss — fall through to download.
  }

  console.log("Downloading", url);
  const response = await fetch(url);
  // Fail loudly rather than caching an HTML error page as tokenizer.json.
  if (!response.ok) {
    throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
  }
  const data = Buffer.from(await response.arrayBuffer());
  await fs.writeFile(tokenizerPath, data);

  // Parse the buffer we just wrote; re-reading it from disk is redundant.
  return JSON.parse(data.toString('utf8'));
}
| |
|