Buckets:

rtrm's picture
download
raw
38.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ce fac dacă dataset-ul meu nu este pe Hub?&quot;,&quot;local&quot;:&quot;what-if-my-dataset-isnt-on-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Lucrând cu dataseturi locale și remote&quot;,&quot;local&quot;:&quot;working-with-local-and-remote-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea unui dataset local&quot;,&quot;local&quot;:&quot;loading-a-local-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea unui dataset remote&quot;,&quot;local&quot;:&quot;loading-a-remote-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/rum/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/entry/start.1de7c3d2.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/singletons.e13b7dfd.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/paths.e130b7b0.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/entry/app.1f82014c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/nodes/0.3c83e1ab.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/nodes/58.8a2e68dc.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ce fac dacă dataset-ul meu nu este pe Hub?&quot;,&quot;local&quot;:&quot;what-if-my-dataset-isnt-on-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Lucrând cu dataseturi locale și remote&quot;,&quot;local&quot;:&quot;working-with-local-and-remote-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea unui dataset local&quot;,&quot;local&quot;:&quot;loading-a-local-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea unui dataset remote&quot;,&quot;local&quot;:&quot;loading-a-remote-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="what-if-my-dataset-isnt-on-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-if-my-dataset-isnt-on-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ce fac dacă dataset-ul meu nu este pe Hub?</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section2.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter5/section2.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-f7j29s">Știți cum să folosiți <a href="https://huggingface.co/datasets" rel="nofollow">Hugging Face Hub</a> pentru a descărca dataseturi, dar vă veți găsi adesea lucrând cu date care sunt stocate fie pe laptopul dumneavoastră, fie pe un server remote. În această secțiune vă vom arăta cum poate fi utilizat 🤗 Datasets pentru a încărca dataseturi care nu sunt disponibile pe Hugging Face Hub.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HyQgpJTkRdE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h2 class="relative group"><a id="working-with-local-and-remote-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#working-with-local-and-remote-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Lucrând cu dataseturi locale și remote</span></h2> <p data-svelte-h="svelte-ct0s01">🤗 Datasets oferă scripturi de încărcare pentru a gestiona încărcarea dataseturilor locale și remote. Suportă mai multe formate de date comune, cum ar fi:</p> <table data-svelte-h="svelte-19mvbg5"><thead><tr><th align="center">Data format</th> <th align="center">Loading script</th> <th align="center">Example</th></tr></thead> <tbody><tr><td align="center">CSV &amp; TSV</td> <td align="center"><code>csv</code></td> <td align="center"><code>load_dataset(&quot;csv&quot;, data_files=&quot;my_file.csv&quot;)</code></td></tr> <tr><td align="center">Text files</td> <td align="center"><code>text</code></td> <td align="center"><code>load_dataset(&quot;text&quot;, data_files=&quot;my_file.txt&quot;)</code></td></tr> <tr><td align="center">JSON &amp; JSON Lines</td> <td align="center"><code>json</code></td> <td align="center"><code>load_dataset(&quot;json&quot;, data_files=&quot;my_file.jsonl&quot;)</code></td></tr> <tr><td align="center">Pickled DataFrames</td> <td align="center"><code>pandas</code></td> <td align="center"><code>load_dataset(&quot;pandas&quot;, data_files=&quot;my_dataframe.pkl&quot;)</code></td></tr></tbody></table> <p data-svelte-h="svelte-zzz8xh">Așa cum se arată în tabel, pentru fiecare format de date trebuie doar să specificăm tipul scriptului de încărcare în funcția <code>load_dataset()</code>, alături de un argument <code>data_files</code> care specifică calea către unul sau mai multe fișiere. Să începem prin încărcarea unui dataset din fișiere locale; mai târziu vom vedea cum să facem același lucru cu fișiere remote.</p> <h2 class="relative group"><a id="loading-a-local-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-a-local-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Încărcarea unui dataset local</span></h2> <p data-svelte-h="svelte-1qma7rp">Pentru acest exemplu vom folosi <a href="https://github.com/crux82/squad-it/" rel="nofollow">datasetul SQuAD-it</a>, care este un dataset la scară largă pentru întrebări și răspunsuri în italiană.</p> <p data-svelte-h="svelte-1ln4coo">Seturile de antrenare și test sunt găzduite pe GitHub, deci le putem descărca cu o comandă simplă <code>wget</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sb1luf">Aceasta va descărca două fișiere comprimate numite <em>SQuAD_it-train.json.gz</em> și <em>SQuAD_it-test.json.gz</em>, pe care le putem decomprima cu comanda Linux <code>gzip</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!gzip -dkv SQuAD_it-*.json.gz<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->SQuAD_it-test.json.gz: 87.4% -- replaced with SQuAD_it-test.json
SQuAD_it-train.json.gz: 82.2% -- replaced with SQuAD_it-train.json<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kneawa">Putem vedea că fișierele comprimate au fost înlocuite cu <em>SQuAD_it-train.json</em> și <em>SQuAD_it-test.json</em>, și că datele sunt stocate în formatul JSON.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-401xui">✎ Dacă vă întrebați de ce există un caracter <code>!</code> în comenzile shell de mai sus, aceasta este pentru că le rulăm într-un notebook Jupyter. Pur și simplu eliminați prefixul dacă doriți să descărcați și să dezarhivați datasetul într-un terminal.</p></div> <p data-svelte-h="svelte-1ob0qeb">Pentru a încărca un fișier JSON cu funcția <code>load_dataset()</code>, trebuie doar să știm dacă avem de-a face cu JSON obișnuit (similar cu un dicționar imbricat) sau JSON Lines (JSON separat pe linii). Ca multe dataseturi de întrebări și răspunsuri, SQuAD-it folosește formatul imbricat, cu tot textul stocat într-un câmp <code>data</code>. Aceasta înseamnă că putem încărca datasetul specificând argumentul <code>field</code> după cum urmează:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
squad_it_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=<span class="hljs-string">&quot;SQuAD_it-train.json&quot;</span>, field=<span class="hljs-string">&quot;data&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sy0xym">În mod implicit, încărcarea fișierelor locale creează un obiect <code>DatasetDict</code> cu un split <code>train</code>. Putem vedea acest lucru prin inspectarea obiectului <code>squad_it_dataset</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->squad_it_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;paragraphs&#x27;</span>],
num_rows: <span class="hljs-number">442</span>
})
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a2594y">Aceasta ne arată numărul de rânduri și numele coloanelor asociate cu setul de antrenare. Putem vizualiza unul dintre exemple prin indexarea în splitul <code>train</code> după cum urmează:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->squad_it_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
<span class="hljs-string">&quot;title&quot;</span>: <span class="hljs-string">&quot;Terremoto del Sichuan del 2008&quot;</span>,
<span class="hljs-string">&quot;paragraphs&quot;</span>: [
{
<span class="hljs-string">&quot;context&quot;</span>: <span class="hljs-string">&quot;Il terremoto del Sichuan del 2008 o il terremoto...&quot;</span>,
<span class="hljs-string">&quot;qas&quot;</span>: [
{
<span class="hljs-string">&quot;answers&quot;</span>: [{<span class="hljs-string">&quot;answer_start&quot;</span>: <span class="hljs-number">29</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;2008&quot;</span>}],
<span class="hljs-string">&quot;id&quot;</span>: <span class="hljs-string">&quot;56cdca7862d2951400fa6826&quot;</span>,
<span class="hljs-string">&quot;question&quot;</span>: <span class="hljs-string">&quot;In quale anno si è verificato il terremoto nel Sichuan?&quot;</span>,
},
...
],
},
...
],
}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1axlpux">Excelent, am încărcat primul nostru dataset local! Dar în timp ce acest lucru a funcționat pentru setul de antrenare, ceea ce dorim cu adevărat este să includem atât spliturile <code>train</code> cât și <code>test</code> într-un singur obiect <code>DatasetDict</code> astfel încât să putem aplica funcțiile <code>Dataset.map()</code> pe ambele splituri deodată. Pentru a face acest lucru, putem furniza un dicționar argumentului <code>data_files</code> care mapează fiecare nume de split la un fișier asociat cu acel split:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">&quot;train&quot;</span>: <span class="hljs-string">&quot;SQuAD_it-train.json&quot;</span>, <span class="hljs-string">&quot;test&quot;</span>: <span class="hljs-string">&quot;SQuAD_it-test.json&quot;</span>}
squad_it_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, field=<span class="hljs-string">&quot;data&quot;</span>)
squad_it_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;paragraphs&#x27;</span>],
num_rows: <span class="hljs-number">442</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;paragraphs&#x27;</span>],
num_rows: <span class="hljs-number">48</span>
})
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-127gck5">Acesta este exact ceea ce am dorit. Acum, putem aplica diverse tehnici de preprocesare pentru a curăța datele, tokeniza recenziile și așa mai departe.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-9otge6">Argumentul <code>data_files</code> al funcției <code>load_dataset()</code> este destul de flexibil și poate fi fie o singură cale de fișier, o listă de căi de fișiere, sau un dicționar care mapează numele spliturilor la căile fișierelor. De asemenea, puteți folosi glob pentru fișiere care se potrivesc unui model specificat conform regulilor folosite de shell-ul Unix (de exemplu, puteți face glob pentru toate fișierele JSON dintr-un director ca un singur split prin setarea <code>data_files=&quot;*.json&quot;</code>). Consultați <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files" rel="nofollow">documentația</a> 🤗 Datasets pentru mai multe detalii.</p></div> <p data-svelte-h="svelte-4rymuu">Scripturile de încărcare din 🤗 Datasets suportă de fapt decomprimarea automată a fișierelor de intrare, deci am fi putut să sărim peste folosirea <code>gzip</code> prin indicarea argumentului <code>data_files</code> direct către fișierele comprimate:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">&quot;train&quot;</span>: <span class="hljs-string">&quot;SQuAD_it-train.json.gz&quot;</span>, <span class="hljs-string">&quot;test&quot;</span>: <span class="hljs-string">&quot;SQuAD_it-test.json.gz&quot;</span>}
squad_it_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, field=<span class="hljs-string">&quot;data&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-p9ezl2">Acest lucru poate fi util dacă nu doriți să decomprimați manual multe fișiere GZIP. Decomprimarea automată se aplică și altor formate comune precum ZIP și TAR, deci trebuie doar să pointați <code>data_files</code> către fișierele comprimate și sunteți gata!</p> <p data-svelte-h="svelte-14qbgil">Acum că știți cum să încărcați fișiere locale pe laptopul sau desktop-ul dumneavoastră, să aruncăm o privire la încărcarea fișierelor remote.</p> <h2 class="relative group"><a id="loading-a-remote-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-a-remote-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Încărcarea unui dataset remote</span></h2> <p data-svelte-h="svelte-1n3767y">Dacă lucrați ca data scientist sau programator într-o companie, există o șansă bună ca dataseturile pe care doriți să le analizați să fie stocate pe un server remote. Din fericire, încărcarea fișierelor remote este la fel de simplă ca încărcarea celor locale! În loc să furnizați o cale către fișiere locale, pointați argumentul <code>data_files</code> al <code>load_dataset()</code> către unul sau mai multe URL-uri unde sunt stocate fișierele remote. De exemplu, pentru datasetul SQuAD-it găzduit pe GitHub, putem pur și simplu să pointăm <code>data_files</code> către URL-urile <em>SQuAD_it-*.json.gz</em> după cum urmează:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->url = <span class="hljs-string">&quot;https://github.com/crux82/squad-it/raw/master/&quot;</span>
data_files = {
<span class="hljs-string">&quot;train&quot;</span>: url + <span class="hljs-string">&quot;SQuAD_it-train.json.gz&quot;</span>,
<span class="hljs-string">&quot;test&quot;</span>: url + <span class="hljs-string">&quot;SQuAD_it-test.json.gz&quot;</span>,
}
squad_it_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, field=<span class="hljs-string">&quot;data&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v2qyj2">Aceasta returnează același obiect <code>DatasetDict</code> obținut mai sus, dar ne economisește pasul de a descărca și decomprima manual fișierele <em>SQuAD_it-*.json.gz</em>. Aceasta încheie incursiunea noastră în diversele modalități de încărcare a dataseturilor care nu sunt găzduite pe Hugging Face Hub. Acum că avem un dataset cu care să ne jucăm, să explorăm diverse tehnici de manipulare a datelor!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1hqket7">✏️ <strong>Încercați!</strong> Alegeți un alt dataset găzduit pe GitHub sau <a href="https://archive.ics.uci.edu/ml/index.php" rel="nofollow">UCI Machine Learning Repository</a> și încercați să îl încărcați atât local cât și remote folosind tehnicile introduse mai sus. Pentru puncte bonus, încercați să încărcați un dataset care este stocat în format CSV sau text (consultați <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files" rel="nofollow">documentația</a> pentru mai multe informații despre aceste formate).</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/rum/chapter5/2.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1ftlxhy = {
assets: "/docs/course/pr_1069/rum",
base: "/docs/course/pr_1069/rum",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/rum/_app/immutable/entry/start.1de7c3d2.js"),
import("/docs/course/pr_1069/rum/_app/immutable/entry/app.1f82014c.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 58],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
38.8 kB
·
Xet hash:
2290f5d808fa7ea21d2436335b15f2f2f6116465a32315eb7a60af0db8f5bc95

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.