Buckets:

rtrm's picture
download
raw
97.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Byte-Pair Encoding tokenization&quot;,&quot;local&quot;:&quot;byte-pair-encoding-tokenization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;อัลกอริทึมที่ใช้ในการเทรน&quot;,&quot;local&quot;:&quot;อลกอรทมทใชในการเทรน&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization algorithm&quot;,&quot;local&quot;:&quot;tokenization-algorithm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;การสร้าง BPE (Implementing BPE)&quot;,&quot;local&quot;:&quot;การสราง-bpe-implementing-bpe&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/th/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/entry/start.eeb02c13.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/singletons.d692cc64.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/paths.f4699845.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/entry/app.25e78aa9.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/nodes/0.d0785757.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/nodes/40.31dc83d6.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/th/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Byte-Pair Encoding tokenization&quot;,&quot;local&quot;:&quot;byte-pair-encoding-tokenization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;อัลกอริทึมที่ใช้ในการเทรน&quot;,&quot;local&quot;:&quot;อลกอรทมทใชในการเทรน&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization algorithm&quot;,&quot;local&quot;:&quot;tokenization-algorithm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;การสร้าง BPE (Implementing BPE)&quot;,&quot;local&quot;:&quot;การสราง-bpe-implementing-bpe&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="byte-pair-encoding-tokenization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#byte-pair-encoding-tokenization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Byte-Pair Encoding tokenization</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/th/chapter6/section5.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/th/chapter6/section5.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-12o0prt">ดั้งเดิมแล้ว Byte-Pair Encoding (BPE) เป็นอัลกอริทึมที่ถูกสร้างเพื่อใช้บีบอัดข้อความให้เล็กลง (compress texts) ภายหลัง OpenAI ได้นำอัลกอริทึมนี้มาใช้ในการตัดคำ ในขั้นตอนเตรียมข้อมูลเพื่อเทรน GPT อัลกอริทึมตัวนี้ยังถูกนำมาใช้อย่างกว้างขวางกับโมเดลประเภท Transformer เช่น GPT, GPT-2, RoBERTa, BART, และ DeBERTa</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HEikzVL-lZU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-waucbj">💡 บทนี้จะพูดถึง BPE อย่างละเอียด เราจะเจาะลึกถึงไปถึงการ implement อัลกอริทึมนี้ คุณสามารถข้ามไปตอนท้ายได้ ถ้าคุณสนใจเพียงแค่ภาพรวมคร่าวๆเท่านั้น</p></div> <h2 class="relative group"><a id="อลกอรทมทใชในการเทรน" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#อลกอรทมทใชในการเทรน"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>อัลกอริทึมที่ใช้ในการเทรน</span></h2> <p data-svelte-h="svelte-ui9660">BPE เริ่มการเทรนด้วยการคำนวณรายการของคำที่อยู่ในคลังข้อมูล (คลังข้อมูลจะต้องผ่านการ normalization และ pre-tokenization มาแล้ว) จากนั้นมันจะเริ่มสร้างชุดคำศัพท์ (vocabulary) จากตัวอักษรที่อยู่ในแต่ละคำ มาดูตัวอย่างง่ายๆกัน เราจะสมมติว่าคลังข้อมูลของเรามีเพียงห้าคำเท่านั้น :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;pug&quot;</span>, <span class="hljs-string">&quot;pun&quot;</span>, <span class="hljs-string">&quot;bun&quot;</span>, <span class="hljs-string">&quot;hugs&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mi54up">vocabulary ตั้งต้นสำหรับชุดข้อมูลนี้คือ <code>[&quot;b&quot;, &quot;g&quot;, &quot;h&quot;, &quot;n&quot;, &quot;p&quot;, &quot;s&quot;, &quot;u&quot;]</code> ในการใช้งานจริง vocabulary ตั้งต้นจะประกอบด้วยตัวอักษร ASCII เป็นอย่างต่ำ หรืออาจจะมีตัวอักษร Unicode ได้ด้วย</p> <p data-svelte-h="svelte-568bhx">ถ้าข้อความใหม่ที่คุณต้องการจะตัดคำ มีสัญลักษณ์ที่ไม่ได้อยู่ใน training corpus สัญลักษณ์พวกนี้จะถูกแปลงเป็น unknown token นี่เป็นเหตุผลว่าทำไมโมเดล NLP จึงประมวลผลข้อความที่มีอีโมจิได้ไม่ดีนัก</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1vpq7rj">Tokenizer ของ GPT-2 และ RoBERTa (ซึ่งค่อนข้างคล้ายกัน) มีวิธีการจัดการกับปัญหานี้ได้อย่างประสิทธิภาพ มันจะไม่มองแต่ละคำเป็น Unicode แต่จะมองว่าเป็น byte การทำแบบนี้ทำให้ vocabulary ตั้งต้น มีขนาดที่เล็ก (256) แต่ยังสามารถบันทึกทุกๆสัญลักษณ์ได้ โดยไม่ต้องแปลงสัญลักษณ์พิเศษต่างๆเป็น unknown token เทคนิคนี้เรียกว่า <em>byte-level BPE</em></p></div> <p data-svelte-h="svelte-19vck82">หลังจากสร้าง vocabulary ตั้งต้นแล้ว เราจะเพิ่ม token ใหม่ๆ เข้าไปจนว่าจะได้ vocabulary ขนาดใหญ่พอกับที่เราต้องการ โดยเราจะเทรน BPE ให้เรียน กฎที่เรียกว่า <em>merges</em> ซึ่งเป็นกฎสำหรับการรวมสองหน่วยใน vocabulary เข้าด้วยกัน
ตอนช่วงเริ่มต้น กฎ merges พวกนี้จะสร้างคำย่อยที่ประกอบด้วยตัวอักษรสองตัว ระหว่างที่เราเทรนต่อไปเรื่อยๆ คำย่อยที่ได้ก็จะยาวขึ้น
ในแต่ละรอบของการเทรน BPE จะคำนวณหาคู่ของคำย่อยที่พบบ่อยที่สุด (คู่ของคำย่อย ในที่นี้เราหมายถึง token ที่อยู่ติดกัน)
คู่ที่มีความถี่มากที่สุดจะถูกรวมเข้าด้วยกัน จากนั้นโมเดลจะทำแบบเดิมอีกในการเทรนรอบต่อไป</p> <p data-svelte-h="svelte-qclq5r">กลับมาที่ตัวอย่างของเรา สมมติว่าแต่ละคำมีความถี่ดังต่อไปนี้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hblxvc">ซึ่งแปลว่า <code>&quot;hug&quot;</code> พบ 10 ครั้งใน corpus, <code>&quot;pug&quot;</code> พบ 5 ครั้ง, <code>&quot;pun&quot;</code> พบ 12 ครั้ง, <code>&quot;bun&quot;</code> พบ 4 ครั้ง, และ <code>&quot;hugs&quot;</code> พบ 5 ครั้ง</p> <p data-svelte-h="svelte-14fxlqe">เราจะเริ่มการเทรน โดยแยกแต่ละคำออกเป็นตัวอักษร (ตัวอักษรจะต้องมาจาก vocabulary ตั้งต้นที่เราสร้างมาก่อนหน้านี้แล้ว) ตอนนี้คุณจะเห็นว่าแต่ละคำถูกแปลงเป็น list ที่ประกอบไปด้วยหลายๆ token</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;g&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;g&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;n&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;n&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;g&quot;</span> <span class="hljs-string">&quot;s&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bmix0l">จากนั้น เราจะดูทีละคู่ ตัวอย่างเช่น คู่ <code>(&quot;h&quot;, &quot;u&quot;)</code> ซึ่งมาจากคำว่า <code>&quot;hug&quot;</code> และ <code>&quot;hugs&quot;</code> และพบ 15 ครั้งใน corpus
อย่างไรก็ตาม คู่นี้ไม่ใช่คู่ที่พบบ่อยที่สุด คู่ที่พบบ่อยที่สุดคือ <code>(&quot;u&quot;, &quot;g&quot;)</code> ซึ่งพบใน คำว่า <code>&quot;hug&quot;</code>, <code>&quot;pug&quot;</code>, และ <code>&quot;hugs&quot;</code> ซึ่งความถี่รวมของมันคือ 20 ครั้ง
ดังนั้น กฎแรกของการ merge ที่ tokenizer เรียนคือ <code>(&quot;u&quot;, &quot;g&quot;) -&gt; &quot;ug&quot;</code> แปลว่ามันจะเพิ่ม <code>&quot;ug&quot;</code> เข้าไปใน vocabulary และใน corpus คู่นี้ก็จะถูกรวมเป็น token เดียวด้วย</p> <p data-svelte-h="svelte-7b8c15">หลังจากขั้นตอนนี้ vocabulary และ corpus จะมีค่าดังนี้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-symbol">Vocabulary:</span> [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>]
<span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;u&quot;</span> <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;ug&quot;</span> <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bqhc5z">ตอนนี้ จะเห็นว่าเรามีคู่ที่เมื่อรวมกันจะได้ token ที่ยาวกว่าสองตัวอักษร ตัวอย่างเช่น คู่ <code>(&quot;h&quot;, &quot;ug&quot;)</code> ซึ่งพบ 15 ครั้งใน corpus
อย่างไรก็ตาม คู่ที่พบบ่อยที่สุดคือ <code>(&quot;u&quot;, &quot;n&quot;)</code> ซึ่งพบ 16 ครั้ง ดังนั้นกฎที่สองก็คือ <code>(&quot;u&quot;, &quot;n&quot;) -&gt; &quot;un&quot;</code> หลังจากที่เราเพิ่มคู่นี้ไปใน vocabulary และ merge token ใน corpus เข้าด้วยกันแล้ว เราจะได้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-symbol">Vocabulary:</span> [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-string">&quot;un&quot;</span>]
<span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;ug&quot;</span> <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g4lab1">ตอนนี้ คู่ที่พบบ่อยที่สุดคือ <code>(&quot;h&quot;, &quot;ug&quot;)</code> ดังนั้นกฎที่ได้คือ <code>(&quot;h&quot;, &quot;ug&quot;) -&gt; &quot;hug&quot;</code> ซึ่งจะทำให้เราได้ token ที่มีสามตัวอักษร หลังจากการ merge เราจะได้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-symbol">Vocabulary:</span> [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>]
<span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;hug&quot;</span> <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rt06o2">เราจะทำแบบนี้ต่อไปเรื่อยๆ จนกว่าจะได้ขนาดของ vocabulary ที่ต้องการ</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ms4toc">✏️ <strong>ตาคุณแล้ว!</strong> คุณคิดว่ากฎ merge ต่อไปคืออะไร</p></div> <h2 class="relative group"><a id="tokenization-algorithm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tokenization-algorithm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tokenization algorithm</span></h2> <p data-svelte-h="svelte-38g5f4">การ tokenization เป็นขั้นตอนหลังจากการเทรน โดย input ใหม่จะถูก tokenize ด้วยขั้นตอนดังต่อไปนี้</p> <ol data-svelte-h="svelte-1dn790a"><li>Normalization (การปรับข้อความให้เป็นมาตรฐาน)</li> <li>Pre-tokenization (การเตรียมข้อความให้พร้อมสำหรับการ tokenize จริง)</li> <li>แยกคำออกเป็นตัวอักษรเดี่ยว</li> <li>ใช้กฎ merge ที่ได้จากการเทรนเพื่อรวมตัวอักษรที่เราได้จากขั้นตอนก่อนหน้า</li></ol> <p data-svelte-h="svelte-1c2vghc">มาดูกฎสามตัวที่เราได้จากการเทรนก่อนหน้านี้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>)</span> -&gt;</span> <span class="hljs-string">&quot;ug&quot;</span>
<span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>)</span> -&gt;</span> <span class="hljs-string">&quot;un&quot;</span>
<span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>)</span> -&gt;</span> <span class="hljs-string">&quot;hug&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n933xw">คำว่า<code>&quot;bug&quot;</code> จะถูกแยกเป็น <code>[&quot;b&quot;, &quot;ug&quot;]</code> ส่วนคำว่า <code>&quot;mug&quot;</code> จะถูกแยกเป็น <code>[&quot;[UNK]&quot;, &quot;ug&quot;]</code> เพราะว่า <code>&quot;m&quot;</code> ไม่ได้อยู่ใน vocabulary ของเรา
้เช่นเดียวกัน คำว่า <code>&quot;thug&quot;</code> จะถูกแยกเป็น <code>[&quot;[UNK]&quot;, &quot;hug&quot;]</code> เพราะว่า <code>&quot;t&quot;</code> ไม่ได้อยู่ใน vocabulary กฎแรกจะรวม <code>&quot;u&quot;</code> และ <code>&quot;g&quot;</code> เข้าด้วยกัน จากนั้น <code>&quot;hu&quot;</code> และ <code>&quot;g&quot;</code> ก็จะถูกรวมเข้าด้วยกัน</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-13f5w1r">✏️ <strong>ตาคุณแล้ว!</strong> คุณคิดว่าคำว่า <code>&quot;unhug&quot;</code> จะถูกแยกอย่างไร</p></div> <h2 class="relative group"><a id="การสราง-bpe-implementing-bpe" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#การสราง-bpe-implementing-bpe"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>การสร้าง BPE (Implementing BPE)</span></h2> <p data-svelte-h="svelte-1e93cpt">ตอนนี้เราจะมาดูกันว่า คุณจะสามารถ implement อัลกอริทึม BPE ได้อย่างไร สิ่งที่เราจะเรียนต่อไปนี้ไม่ใช่ implementation ที่ดีที่สุด เราเพียงต้องการให้คุณเข้าใจโค้ดและเข้าใจว่า BPE ทำงานอย่างไร</p> <p data-svelte-h="svelte-5w2uhv">อันดับแรก เราต้องการ corpus ดังนั้น เราจะสร้าง corpus แบบง่ายๆขึ้นมา โดยประกอบไปด้วยไม่กี่ประโยค :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->corpus = [
<span class="hljs-string">&quot;This is the Hugging Face course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1p977ot">จากนั้นเราจะทำการ pre-tokenize corpus นี้ เพื่อแยกข้อความออกเป็นคำๆ เนื่องจากเราจะสร้าง BPE tokenizer ตามตัวที่ใช้ใน GPT-2 เราจึงต้องใช้ <code>gpt2</code> tokenizer ในการ pre-tokenize</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;gpt2&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-97lq7n">จากนั้นเราจะคำนวณความถี่ของแต่ละคำ:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
<span class="hljs-built_in">print</span>(word_freqs)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->defaultdict(<span class="hljs-built_in">int</span>, {<span class="hljs-string">&#x27;This&#x27;</span>: <span class="hljs-number">3</span>, <span class="hljs-string">&#x27;Ġis&#x27;</span>: <span class="hljs-number">2</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;ĠHugging&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;ĠFace&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;ĠCourse&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;.&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;Ġchapter&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;Ġabout&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġtokenization&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġsection&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġshows&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġseveral&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġtokenizer&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġalgorithms&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;Hopefully&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;,&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġyou&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġwill&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġbe&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġable&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġto&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġunderstand&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġhow&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;Ġthey&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġare&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġtrained&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġand&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġgenerate&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Ġtokens&#x27;</span>: <span class="hljs-number">1</span>})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ki81xp">ขั้นตอนต่อไป คือการคำนวณ vocabulary ตั้งต้น ซึ่งสร้างจากแต่ละตัวอักษรใน corpus :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->alphabet = []
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys():
<span class="hljs-keyword">for</span> letter <span class="hljs-keyword">in</span> word:
<span class="hljs-keyword">if</span> letter <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet:
alphabet.append(letter)
alphabet.sort()
<span class="hljs-built_in">print</span>(alphabet)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[ <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;d&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;f&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;k&#x27;</span>, <span class="hljs-string">&#x27;l&#x27;</span>, <span class="hljs-string">&#x27;m&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;p&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>,
<span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;v&#x27;</span>, <span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>, <span class="hljs-string">&#x27;z&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ivy2j">เราจะเพิ่ม token พิเศษเข้าไปในข้างหน้า list นี้ด้วย GPT-2 ใช้ token พิเศษคือ <code>&quot;&lt;|endoftext|&gt;&quot;</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab = [<span class="hljs-string">&quot;&lt;|endoftext|&gt;&quot;</span>] + alphabet.copy()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d6a2sn">จากนั้นเราจะแยกแต่ละคำใน corpus ให้เป็นตัวอักษร เพื่อที่เราจะได้เริ่มการเทรน :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = {word: [c <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> word] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys()}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1n9k9s1">ตอนนี้เราก็พร้อมที่จะเทรนแล้ว เราจะเริ่มด้วยการเขียนฟังก์ชันที่คำนวณความถี่ของแต่ละคู่ token :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_pair_freqs</span>(<span class="hljs-params">splits</span>):
pair_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>):
pair = (split[i], split[i + <span class="hljs-number">1</span>])
pair_freqs[pair] += freq
<span class="hljs-keyword">return</span> pair_freqs<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jpkvry">มาดูส่วนผลลัพธ์ (ซึ่งเป็น dictionary) กัน :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pair_freqs = compute_pair_freqs(splits)
<span class="hljs-keyword">for</span> i, key <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(pair_freqs.keys()):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{key}</span>: <span class="hljs-subst">{pair_freqs[key]}</span>&quot;</span>)
<span class="hljs-keyword">if</span> i &gt;= <span class="hljs-number">5</span>:
<span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>): <span class="hljs-number">3</span>
(<span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>): <span class="hljs-number">3</span>
(<span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>): <span class="hljs-number">5</span>
(<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>): <span class="hljs-number">2</span>
(<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>): <span class="hljs-number">7</span>
(<span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>): <span class="hljs-number">3</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ytfm66">จากนั้นเราจะหาคู่ที่พบบ่อยที่สุด ซึ่งทำได้ง่ายๆดังนี้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->best_pair = <span class="hljs-string">&quot;&quot;</span>
max_freq = <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items():
<span class="hljs-keyword">if</span> max_freq <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_freq &lt; freq:
best_pair = pair
max_freq = freq
<span class="hljs-built_in">print</span>(best_pair, max_freq)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>) <span class="hljs-number">7</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-knui6i">ดังนั้น กฎแรกที่เราได้ก็คือ <code>(&#39;Ġ&#39;, &#39;t&#39;) -&gt; &#39;Ġt&#39;</code> และเราจะต้องเพิ่ม <code>&#39;Ġt&#39;</code> เข้าไปใน vocabulary :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->merges = {(<span class="hljs-string">&quot;Ġ&quot;</span>, <span class="hljs-string">&quot;t&quot;</span>): <span class="hljs-string">&quot;Ġt&quot;</span>}
vocab.append(<span class="hljs-string">&quot;Ġt&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gz885v">จากนั้น เราจะต้องทำการ merge คำย่อยที่อยู่ใน dictionary <code>splits</code> ด้วย โดยเราจะเขียนฟังก์ชันต่อไปนี้ :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_pair</span>(<span class="hljs-params">a, b, splits</span>):
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs:
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
i = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> i &lt; <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>:
<span class="hljs-keyword">if</span> split[i] == a <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == b:
split = split[:i] + [a + b] + split[i + <span class="hljs-number">2</span> :]
<span class="hljs-keyword">else</span>:
i += <span class="hljs-number">1</span>
splits[word] = split
<span class="hljs-keyword">return</span> splits<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k9g317">และนี่ก็คือผลลัพธ์จากการ merge ครั้งแรก :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = merge_pair(<span class="hljs-string">&quot;Ġ&quot;</span>, <span class="hljs-string">&quot;t&quot;</span>, splits)
<span class="hljs-built_in">print</span>(splits[<span class="hljs-string">&quot;Ġtrained&quot;</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;Ġt&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;d&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ch2nw4">ตอนนี้เราก็มีทุกอย่างพร้อมสำหรับการเทรนแล้ว เราจะเทรนจนกว่าขนาดของ vocabulary จะเท่ากับ 50 :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab_size = <span class="hljs-number">50</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(vocab) &lt; vocab_size:
pair_freqs = compute_pair_freqs(splits)
best_pair = <span class="hljs-string">&quot;&quot;</span>
max_freq = <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items():
<span class="hljs-keyword">if</span> max_freq <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_freq &lt; freq:
best_pair = pair
max_freq = freq
splits = merge_pair(*best_pair, splits)
merges[best_pair] = best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>]
vocab.append(best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1047qda">ผลลัพธ์ที่ได้คือ tokenizer ของเราได้เรียน 19 กฎ (vocabulary ตั้งต้นมี 31 token ซึ่งมาจากตัวอักษรที่เรามี 30 ตัวและ token พิเศษอีกหนึ่งตัว) :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(merges)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{(<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>): <span class="hljs-string">&#x27;Ġt&#x27;</span>, (<span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>): <span class="hljs-string">&#x27;is&#x27;</span>, (<span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>): <span class="hljs-string">&#x27;er&#x27;</span>, (<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>): <span class="hljs-string">&#x27;Ġa&#x27;</span>, (<span class="hljs-string">&#x27;Ġt&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>): <span class="hljs-string">&#x27;Ġto&#x27;</span>, (<span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>): <span class="hljs-string">&#x27;en&#x27;</span>,
(<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>): <span class="hljs-string">&#x27;Th&#x27;</span>, (<span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>): <span class="hljs-string">&#x27;This&#x27;</span>, (<span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>): <span class="hljs-string">&#x27;ou&#x27;</span>, (<span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>): <span class="hljs-string">&#x27;se&#x27;</span>, (<span class="hljs-string">&#x27;Ġto&#x27;</span>, <span class="hljs-string">&#x27;k&#x27;</span>): <span class="hljs-string">&#x27;Ġtok&#x27;</span>,
(<span class="hljs-string">&#x27;Ġtok&#x27;</span>, <span class="hljs-string">&#x27;en&#x27;</span>): <span class="hljs-string">&#x27;Ġtoken&#x27;</span>, (<span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;d&#x27;</span>): <span class="hljs-string">&#x27;nd&#x27;</span>, (<span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>): <span class="hljs-string">&#x27;Ġis&#x27;</span>, (<span class="hljs-string">&#x27;Ġt&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>): <span class="hljs-string">&#x27;Ġth&#x27;</span>, (<span class="hljs-string">&#x27;Ġth&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>): <span class="hljs-string">&#x27;Ġthe&#x27;</span>,
(<span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>): <span class="hljs-string">&#x27;in&#x27;</span>, (<span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>): <span class="hljs-string">&#x27;Ġab&#x27;</span>, (<span class="hljs-string">&#x27;Ġtoken&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>): <span class="hljs-string">&#x27;Ġtokeni&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19ymfzy">ส่วน vocabulary ที่ได้จะประกอบไปด้วย token พิเศษ, ตัวอักษรตั้งต้น, และผลลัพธ์จากการ merge แต่ละครั้ง :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(vocab)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;&lt;|endoftext|&gt;&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;d&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;f&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;k&#x27;</span>, <span class="hljs-string">&#x27;l&#x27;</span>, <span class="hljs-string">&#x27;m&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>,
<span class="hljs-string">&#x27;p&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;v&#x27;</span>, <span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>, <span class="hljs-string">&#x27;z&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġt&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;er&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġto&#x27;</span>, <span class="hljs-string">&#x27;en&#x27;</span>, <span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;This&#x27;</span>, <span class="hljs-string">&#x27;ou&#x27;</span>, <span class="hljs-string">&#x27;se&#x27;</span>,
<span class="hljs-string">&#x27;Ġtok&#x27;</span>, <span class="hljs-string">&#x27;Ġtoken&#x27;</span>, <span class="hljs-string">&#x27;nd&#x27;</span>, <span class="hljs-string">&#x27;Ġis&#x27;</span>, <span class="hljs-string">&#x27;Ġth&#x27;</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;Ġab&#x27;</span>, <span class="hljs-string">&#x27;Ġtokeni&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-7ebhk9">💡 ถ้าคุณใช้ <code>train_new_from_iterator()</code> กับ corpus เดียวกันนี้ คุณจะไม่ได้ vocabulary เดียวกัน เพราะว่าอาจจะมีหลายคู่ token ที่มีความถี่สูงสุดเท่ากัน ในตัวอย่างของเรา เราเลือกคู่แรกที่โค้ดของเราอ่านเจอ ส่วน 🤗 Tokenizers library เลือกคู่แรกโดยเรียงตาม ID</p></div> <p data-svelte-h="svelte-1g50xxw">หากเราต้องการ tokenize ข้อความใดข้อความหนึ่ง สิ่งที่ต้องทำคือ pre-tokenize จากนั้นจึงทำการ tokenize และสุดท้าย apply กฎ merge :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text</span>):
pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> pre_tokenize_result]
splits = [[l <span class="hljs-keyword">for</span> l <span class="hljs-keyword">in</span> word] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">for</span> pair, merge <span class="hljs-keyword">in</span> merges.items():
<span class="hljs-keyword">for</span> idx, split <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(splits):
i = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> i &lt; <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>:
<span class="hljs-keyword">if</span> split[i] == pair[<span class="hljs-number">0</span>] <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == pair[<span class="hljs-number">1</span>]:
split = split[:i] + [merge] + split[i + <span class="hljs-number">2</span> :]
<span class="hljs-keyword">else</span>:
i += <span class="hljs-number">1</span>
splits[idx] = split
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(splits, [])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xxndsq">คุณสามารถทดลองโค้ดนี้ได้กับข้อความทุกข้อความ ที่ประกอบไปด้วยตัวอักษรเท่านั้น :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenize(<span class="hljs-string">&quot;This is not a token.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;This&#x27;</span>, <span class="hljs-string">&#x27;Ġis&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġtoken&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-qgvtsz">⚠️ การ implementation ในตัวอย่างของเราจะ return error ถ้าโปรแกรมอ่านเจอตัวอักษรที่ไม่มีใน vocabulary นั่นเพราะว่าเราไม่ได้เขียนโค้ดเพื่อจัดการกับกรณีแบบนี้
ใน GPT-2 ปกติจะไม่มี unknown token แบบนี้ เพราะว่า ถ้าเราใช้ byte-level BPE เราจะไม่มีทางได้ตัวอักษรที่ unknown อย่างไรก็ตามในตัวอย่างของเรา เราไม่ได้ใช้ทุกๆ byte เพื่อสร้าง vocabulary ตั้งต้น
อย่างไรก็ตาม หัวข้อนี้นั้นค่อนข้างลึก เราจึงจะไม่ขอพูดถึงรายละเอียดไปมากกว่านี้</p></div> <p data-svelte-h="svelte-zewf2t">นี่ก็คือ อัลกอริทึม BPE ในบทต่อไป เราจะมาดู WordPiece กัน</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/th/chapter6/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1q1y86z = {
assets: "/docs/course/pr_1069/th",
base: "/docs/course/pr_1069/th",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/th/_app/immutable/entry/start.eeb02c13.js"),
import("/docs/course/pr_1069/th/_app/immutable/entry/app.25e78aa9.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 40],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
97.1 kB
·
Xet hash:
8446466d9fa279bb84ede00edbb6b863fc0fcdf21c437c073213554377713ce0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.