Buckets:

rtrm's picture
download
raw
85.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Crearea propriului tău dataset&quot;,&quot;local&quot;:&quot;creating-your-own-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obținerea datelor&quot;,&quot;local&quot;:&quot;getting-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Curățarea datelor&quot;,&quot;local&quot;:&quot;cleaning-up-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Îmbunătățirea datasetului&quot;,&quot;local&quot;:&quot;augmenting-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea datasetului pe Hugging Face Hub&quot;,&quot;local&quot;:&quot;uploading-the-dataset-to-the-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Crearea unei dataset card&quot;,&quot;local&quot;:&quot;creating-a-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/rum/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/entry/start.1de7c3d2.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/singletons.e13b7dfd.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/paths.e130b7b0.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/entry/app.1f82014c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/nodes/0.3c83e1ab.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/nodes/61.d1dff247.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/rum/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Crearea propriului tău dataset&quot;,&quot;local&quot;:&quot;creating-your-own-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obținerea datelor&quot;,&quot;local&quot;:&quot;getting-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Curățarea datelor&quot;,&quot;local&quot;:&quot;cleaning-up-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Îmbunătățirea datasetului&quot;,&quot;local&quot;:&quot;augmenting-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Încărcarea datasetului pe Hugging Face Hub&quot;,&quot;local&quot;:&quot;uploading-the-dataset-to-the-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Crearea unei dataset card&quot;,&quot;local&quot;:&quot;creating-a-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="creating-your-own-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creating-your-own-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 
0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Crearea propriului tău dataset</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section5.ipynb" target="_blank"><img alt="Open 
In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter5/section5.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-q9med2">Uneori, datasetul necesar pentru a construi o aplicație NLP nu există, astfel încât va trebui să-l creați singuri. În această secțiune vom arăta cum să creați un corpus de <a href="https://github.com/features/issues/" rel="nofollow">GitHub issues</a>, care sunt utilizate în mod obișnuit pentru a urmări erorile sau feature-urile din repositoriile GitHub. Acest corpus poate fi folosit pentru diverse scopuri, inclusiv:</p> <ul data-svelte-h="svelte-1g50u3y"><li>Explorarea timpului necesar pentru închiderea unor issues deschise sau pull requesturi</li> <li>Antrenarea unui <em>multilabel classifier</em> care poate eticheta issue-urile cu metadate pe baza descrierii issue-urilor (de exemplu, “bug”, “enhancement” sau “question”)</li> <li>Crearea unui motor de căutare semantică pentru a găsi care issues se potrivesc query-ului utilizatorului</li></ul> <p data-svelte-h="svelte-jqj8kl">În această secțiune ne vom concentra pe crearea corpusului, iar în următoarea vom aborda aplicația motorului de căutare semantică. Pentru a păstra lucrurile meta, vom folosi issue-urile GitHub asociate cu un proiect open source popular: 🤗 Datasets! 
Să vedem cum să obținem datele și să explorăm informațiile conținute în aceste issue-uri.</p> <h2 class="relative group"><a id="getting-the-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getting-the-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Obținerea datelor</span></h2> <p data-svelte-h="svelte-1tslfmk">Puteți găsi toate issue-urile din 🤗 Datasets navigând către tabul <a href="https://github.com/huggingface/datasets/issues" rel="nofollow">Issues</a> al repositoriului. Așa cum arată următorul screenshot, la momentul scrierii acestui text existau 331 de issues deschise și 668 închise.</p> <div class="flex justify-center" data-svelte-h="svelte-1nfcdyp"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues.png" alt="Issue-urile GitHub asociate cu 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-1v73hmj">Dacă dați clic pe unul dintre aceste issue-uri, veți vedea că acesta conține un titlu, o descriere și un set de labeluri care caracterizează issue-ul. 
Un exemplu este prezentat în screenshotul următor.</p> <div class="flex justify-center" data-svelte-h="svelte-1os1cjx"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-single.png" alt="Un issue tipic în GitHub din repositoriul 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-15bl581">Pentru a descărca toate issue-urile din repositoriu, vom folosi <a href="https://docs.github.com/en/rest" rel="nofollow">GitHub REST API</a> pentru a enumera <a href="https://docs.github.com/en/rest/reference/issues#list-repository-issues" rel="nofollow"><code>Issues</code> endpoint</a>. Acest endpoint returnează o listă de obiecte JSON, fiecare obiect conținând un număr mare de câmpuri care includ titlul și descrierea, precum și metadate despre starea issue-ului și așa mai departe.</p> <p data-svelte-h="svelte-fzac58">Un mod convenabil de descărcare a issue-urilor este prin utilizarea librăriei <code>requests</code>, care este modalitatea standard pentru a face cereri HTTP în Python. 
Puteți instala librăria rulând comanda:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install requests<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ot26xa">Odată ce librăria este instalată, puteți face cereri GET la <code>Issues</code> endpoint prin invocarea funcției <code>requests.get()</code>. 
De exemplu, puteți rula următorul cod pentru a obține primul issue din prima pagină:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> requests
url = <span class="hljs-string">&quot;https://api.github.com/repos/huggingface/datasets/issues?page=1&amp;per_page=1&quot;</span>
response = requests.get(url)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7clxu8">Obiectul <code>response</code> conține o cantitate mare de informații utile despre requestul efectuat, inclusiv HTTP status code:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.status_code<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">200</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s2gl5h">unde statusul <code>200</code> înseamnă că cererea a reușit (puteți găsi o listă completă de status coduri HTTP <a href="https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" rel="nofollow">aici</a>). Ceea ce ne interesează însă este <em>payload</em>-ul, care poate fi accesat în diverse formate, precum bytes, string sau JSON. 
Deoarece știm că issue-urile noastre sunt în format JSON, să inspectăm payload-ul astfel:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;repository_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets&#x27;</span>,
<span class="hljs-string">&#x27;labels_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}&#x27;</span>,
<span class="hljs-string">&#x27;comments_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/comments&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/events&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">968650274</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0&#x27;</span>,
<span class="hljs-string">&#x27;number&#x27;</span>: <span class="hljs-number">2792</span>,
<span class="hljs-string">&#x27;title&#x27;</span>: <span class="hljs-string">&#x27;Update GooAQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;labels&#x27;</span>: [],
<span class="hljs-string">&#x27;state&#x27;</span>: <span class="hljs-string">&#x27;open&#x27;</span>,
<span class="hljs-string">&#x27;locked&#x27;</span>: <span class="hljs-literal">False</span>,
<span class="hljs-string">&#x27;assignee&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;assignees&#x27;</span>: [],
<span class="hljs-string">&#x27;milestone&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;comments&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T11:40:18Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;closed_at&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;active_lock_reason&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;pull_request&#x27;</span>: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/2792&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.diff&#x27;</span>,
<span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.patch&#x27;</span>},
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&#x27;[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.&#x27;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-w28vk1">Uau, aceasta e o cantitate mare de informație! Putem vedea câmpuri utile cum ar fi <code>title</code>, <code>body</code> și <code>number</code> care descriu problema, precum și informații despre utilizatorul GitHub care a deschis issue-ul.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-sjr1zc">✏️ <strong>Încercați!</strong> Faceți clic pe câteva dintre URL-urile din payload-ul JSON de mai sus pentru a vă familiariza cu tipul de informații către care se face referire pentru fiecare GitHub issue.</p></div> <p data-svelte-h="svelte-1o5tle4">După cum este descris în <a href="https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting" rel="nofollow">documentația</a> GitHub, solicitările neautentificate sunt limitate la 60 de solicitări pe oră. Deși puteți crește <code>per_page</code> query parameter pentru a reduce numărul de solicitări pe care le faceți, oricum veți atinge limita pentru orice repository care are mai mult de câteva mii de issues. Prin urmare, ar trebui să urmați <a href="https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token" rel="nofollow">instrucțiunile</a> GitHub pentru crearea unui <em>personal access token</em> astfel încât să puteți crește limita la 5.000 de solicitări pe oră. 
Odată ce aveți tokenul, îl puteți include ca parte a request header:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->GITHUB_TOKEN = xxx <span class="hljs-comment"># Copy your GitHub token here</span>
headers = {<span class="hljs-string">&quot;Authorization&quot;</span>: <span class="hljs-string">f&quot;token <span class="hljs-subst">{GITHUB_TOKEN}</span>&quot;</span>}<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-18ilrx3">⚠️ Nu oferiți nimănui un notebook cu <code>GITHUB_TOKEN</code> în el. Vă recomandăm să ștergeți ultima celulă odată ce ați executat-o pentru a evita scurgerea accidentală a acestor informații. Chiar mai bine, stocați tokenul într-un fișier <em>.env</em> și utilizați biblioteca <code>python-dotenv</code> pentru a-l încărca automat ca variabilă de mediu.</p></div> <p data-svelte-h="svelte-rumg9a">Acum că avem tokenul de acces, hai să creăm o funcție care să poată descărca toate issue-urile dintr-un repositoriu GitHub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute 
bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> math
<span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path
<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
<span class="hljs-keyword">from</span> tqdm.notebook <span class="hljs-keyword">import</span> tqdm
<span class="hljs-keyword">def</span> <span class="hljs-title function_">fetch_issues</span>(<span class="hljs-params">
owner=<span class="hljs-string">&quot;huggingface&quot;</span>,
repo=<span class="hljs-string">&quot;datasets&quot;</span>,
num_issues=<span class="hljs-number">10_000</span>,
rate_limit=<span class="hljs-number">5_000</span>,
issues_path=Path(<span class="hljs-params"><span class="hljs-string">&quot;.&quot;</span></span>),
</span>):
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> issues_path.is_dir():
issues_path.mkdir(exist_ok=<span class="hljs-literal">True</span>)
batch = []
all_issues = []
per_page = <span class="hljs-number">100</span> <span class="hljs-comment"># Number of issues to return per page</span>
num_pages = math.ceil(num_issues / per_page)
base_url = <span class="hljs-string">&quot;https://api.github.com/repos&quot;</span>
<span class="hljs-keyword">for</span> page <span class="hljs-keyword">in</span> tqdm(<span class="hljs-built_in">range</span>(num_pages)):
<span class="hljs-comment"># Query with state=all to get both open and closed issues</span>
query = <span class="hljs-string">f&quot;issues?page=<span class="hljs-subst">{page}</span>&amp;per_page=<span class="hljs-subst">{per_page}</span>&amp;state=all&quot;</span>
issues = requests.get(<span class="hljs-string">f&quot;<span class="hljs-subst">{base_url}</span>/<span class="hljs-subst">{owner}</span>/<span class="hljs-subst">{repo}</span>/<span class="hljs-subst">{query}</span>&quot;</span>, headers=headers)
batch.extend(issues.json())
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(batch) &gt; rate_limit <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(all_issues) &lt; num_issues:
all_issues.extend(batch)
batch = [] <span class="hljs-comment"># Flush batch for next time period</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Reached GitHub rate limit. Sleeping for one hour ...&quot;</span>)
time.sleep(<span class="hljs-number">60</span> * <span class="hljs-number">60</span> + <span class="hljs-number">1</span>)
all_issues.extend(batch)
df = pd.DataFrame.from_records(all_issues)
df.to_json(<span class="hljs-string">f&quot;<span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>, orient=<span class="hljs-string">&quot;records&quot;</span>, lines=<span class="hljs-literal">True</span>)
<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;Downloaded all the issues for <span class="hljs-subst">{repo}</span>! Dataset stored at <span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dvol3l">Acum, când apelăm <code>fetch_issues()</code>, funcția va descărca toate issue-urile în batch-uri pentru a evita depășirea limitei GitHub pe numărul de solicitări pe oră; rezultatul va fi stocat într-un fișier <code>repository_name-issues.jsonl</code>, unde fiecare linie este un obiect JSON care reprezintă un issue. Mai jos folosim această funcție pentru a obține toate issue-urile de la 🤗 Datasets:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># În dependență de conexiunea ta la internet, acest lucru poate dura câteva minute...</span>
fetch_issues()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kikhs">Odată ce issue-urile sunt descărcate, le putem încărca local utilizând abilitățile noastre dobândite în <a href="/course/chapter5/2">secțiunea 2</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=<span class="hljs-string">&quot;datasets-issues.jsonl&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;timeline_url&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>],
num_rows: <span class="hljs-number">3019</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kghnkq">Super, am creat primul nostru dataset de la zero! Dar de ce sunt câteva mii de issue-uri atunci când tabul de issue-uri al repositoriului 🤗 Datasets afișează doar aproximativ 1.000 de issue-uri în total 🤔? Așa cum este descris în <a href="https://docs.github.com/en/rest/reference/issues#list-issues-assigned-to-the-authenticated-user" rel="nofollow">documentația</a> GitHub, acest lucru s-a întâmplat pentru că am descărcat și toate pull requesturile:</p> <blockquote data-svelte-h="svelte-165ve2g"><p>GitHub’s REST API v3 considers every pull request an issue, but not every issue is a pull request. For this reason, “Issues” endpoints may return both issues and pull requests in the response. You can identify pull requests by the <code>pull_request</code> key. Be aware that the <code>id</code> of a pull request returned from “Issues” endpoints will be an issue id.</p></blockquote> <p data-svelte-h="svelte-37i03s">Deoarece conținutul issue-urilor și pull requesturilor este destul de diferit, hai să preprocesăm puțin datele pentru a ne permite să le diferențiem între ele.</p> <h2 class="relative group"><a id="cleaning-up-the-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cleaning-up-the-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 
1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Curățarea datelor</span></h2> <p data-svelte-h="svelte-xtwl00">Fragmentul de mai sus din documentația GitHub ne spune că coloana <code>pull_request</code> poate fi utilizată pentru a diferenția între issues și pull requests. Să analizăm un sample aleatoriu pentru a vedea care este diferența. Așa cum am făcut în <a href="/course/chapter5/3">secțiunea 3</a>, vom înlănțui <code>Dataset.shuffle()</code> și <code>Dataset.select()</code> pentru a crea un sample aleatoriu și apoi vom împerechea coloanele <code>html_url</code> și <code>pull_request</code> pentru a putea compara diversele URL-uri:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START 
-->sample = issues_dataset.shuffle(seed=<span class="hljs-number">666</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>))
<span class="hljs-comment"># Print out the URL and pull request entries</span>
<span class="hljs-keyword">for</span> url, pr <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(sample[<span class="hljs-string">&quot;html_url&quot;</span>], sample[<span class="hljs-string">&quot;pull_request&quot;</span>]):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; URL: <span class="hljs-subst">{url}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; Pull request: <span class="hljs-subst">{pr}</span>\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">850</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/850&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.patch&#x27;</span>}
&gt;&gt; URL: https://github.com/huggingface/datasets/issues/<span class="hljs-number">2773</span>
&gt;&gt; Pull request: <span class="hljs-literal">None</span>
&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">783</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/783&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.patch&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1djz2j8">Aici putem vedea că fiecare pull request este asociat cu diverse URL-uri, în timp ce issue-urile obișnuite au o intrare <code>None</code>. Putem utiliza această distincție pentru a crea o nouă coloană <code>is_pull_request</code> care verifică dacă câmpul <code>pull_request</code> este <code>None</code> sau nu:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute 
bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;is_pull_request&quot;</span>: <span class="hljs-literal">False</span> <span class="hljs-keyword">if</span> x[<span class="hljs-string">&quot;pull_request&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">True</span>}
)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1l6kipy">✏️ <strong>Încercați!</strong> Calculați timpul mediu necesar pentru închiderea issue-urilor în Datasets. Vă poate fi utilă funcția <code>Dataset.filter()</code> pentru a filtra pull requesturile și issue-urile deschise, și puteți utiliza funcția <code>Dataset.set_format()</code> pentru a converti datasetul într-un <code>DataFrame</code> astfel încât să puteți manipula cu ușurință timestampurile <code>created_at</code> și <code>closed_at</code>. Pentru puncte bonus, calculați timpul mediu necesar pentru închiderea pull requesturilor.</p></div> <p data-svelte-h="svelte-1pc596d">Deși am putea continua să curățăm datasetul prin eliminarea sau redenumirea unor coloane, este, în general, o practică bună să păstrăm datasetul cât mai “raw” posibil la acest stadiu, astfel încât să poată fi utilizat ușor în multiple aplicații.</p> <p data-svelte-h="svelte-11000t7">Înainte de a încărca datasetul în Hugging Face Hub, trebuie să rezolvăm o chestie care lipsește din el: comentariile asociate fiecărui issue și pull request. 
Le vom adăuga în continuare cu— ați ghicit — GitHub REST API!</p> <h2 class="relative group"><a id="augmenting-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#augmenting-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Îmbunătățirea datasetului</span></h2> <p data-svelte-h="svelte-1t6wt29">După cum se vede în următorul screenshot, comentariile asociate unui issue sau pull request oferă o sursă bogată de informații, în special dacă suntem interesați să construim un motor de căutare pentru a răspunde la întrebările utilizatorilor despre bibliotecă.</p> <div class="flex justify-center" data-svelte-h="svelte-1qll6ft"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-comment.png" alt="Comentariile asociate unei probleme despre 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-pa5erq">GitHub REST API oferă un endpoint <a href="https://docs.github.com/en/rest/reference/issues#list-issue-comments" rel="nofollow"><code>Comments</code></a> care returnează toate comentariile asociate numărului problemei. 
Să testăm endpointul pentru a vedea ce returnează:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issue_number = <span class="hljs-number">2792</span>
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/comments/897594128&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128&#x27;</span>,
<span class="hljs-string">&#x27;issue_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">897594128</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;IC_kwDODunzps41gDMQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:21:52Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?&quot;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xjklhl">Putem vedea că comentariul este stocat în câmpul <code>body</code>, așa că putem scrie o funcție simplă care returnează toate comentariile asociate unei probleme prin extragerea conținutului <code>body</code> pentru fiecare element în <code>response.json()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_comments</span>(<span class="hljs-params">issue_number</span>):
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
<span class="hljs-keyword">return</span> [r[<span class="hljs-string">&quot;body&quot;</span>] <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> response.json()]
<span class="hljs-comment"># Testăm dacă funcția lucrează cum ne dorim</span>
get_comments(<span class="hljs-number">2792</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it 
works fine. Any suggestions on how can I avoid this error?&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sqy8v7">Arată bine. Acum hai să folosim <code>Dataset.map()</code> pentru a adăuga noi coloane <code>comments</code> fiecărui issue în datasetul nostru:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Depending on your internet connection, this can take a few minutes...</span>
issues_with_comments_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;comments&quot;</span>: get_comments(x[<span class="hljs-string">&quot;number&quot;</span>])}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18vozdy">Ultimul pas este să facem push datasetului nostru pe Hub. Să vedem cum putem face asta.</p> <h2 class="relative group"><a id="uploading-the-dataset-to-the-hugging-face-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#uploading-the-dataset-to-the-hugging-face-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Încărcarea datasetului pe Hugging Face Hub</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HaN6qCr_Afc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-19srtem">Acum că avem datasetul nostru augmentat, este timpul să îi facem push pe Hub pentru a-l oferi comunității! Încărcarea unui dataset este foarte simplă: la fel ca modelele și tokenizerii din 🤗 Transformers, putem utiliza o metodă <code>push_to_hub()</code> pentru a face push unui dataset. 
Pentru a face asta, avem nevoie de un token de autentificare, care poate fi obținut prin autentificarea pe Hugging Face Hub cu funcția <code>notebook_login()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d62pml">Acest lucru va crea un widget unde poți să scrii username-ul și parola ta, iar un API token va fi salvat în <em>~/.huggingface/token</em>. Dacă rulezi codul într-un terminal, te poți loga cu ajutorul CLI:
</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qhk020">Odată ce ai făcut asta, putem încărca datasetul rulând:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_with_comments_dataset.push_to_hub(<span class="hljs-string">&quot;github-issues&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1f23lsk">De acum, oricine poate să descarce datasetul, utilizând <code>load_dataset()</code> cu ID-ul repositoriului ca argument <code>path</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full 
transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->remote_dataset = load_dataset(<span class="hljs-string">&quot;lewtun/github-issues&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
remote_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">2855</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nh3ov3">Cool, am încărcat datasetul nostru pe Hub și acum este disponibil pentru alții să îl utilizeze! Mai este doar un lucru important de făcut: adăugarea unui <em>dataset card</em> care explică cum a fost creat corpusul și oferă alte informații utile pentru comunitate.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-vijdga">💡 De asemenea, puteți încărca un dataset pe Hugging Face Hub direct din terminal utilizând <code>huggingface-cli</code> și puțină magie Git. Consultați <a href="https://huggingface.co/docs/datasets/share#share-a-dataset-using-the-cli" rel="nofollow">ghidul 🤗 Datasets</a> pentru detalii despre cum puteți face asta.</p></div> <h2 class="relative group"><a id="creating-a-dataset-card" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creating-a-dataset-card"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Crearea unei dataset 
card</span></h2> <p data-svelte-h="svelte-1v8gyxc">Dataseturile bine documentate sunt mai probabil să fie utile altora (inclusiv ție din viitor!), deoarece furnizează contextul pentru a permite utilizatorilor să decidă dacă datasetul este relevant pentru taskul lor și să evalueze eventualele biasuri sau riscurile asociate cu utilizarea datasetului.</p> <p data-svelte-h="svelte-1yj0d15">Pe Hugging Face Hub, această informație este stocată în fișierul <em>README.md</em> al fiecărui dataset repository. Sunt doi pași principali pe care trebuie să îi efectuați înainte de a crea acest fișier:</p> <ol data-svelte-h="svelte-13pn4mo"><li>Utilizați aplicația <a href="https://huggingface.co/datasets/tagging/" rel="nofollow"><code>datasets-tagging</code></a> pentru a crea etichete de metadate în format YAML. Aceste taguri sunt utilizate pentru o varietate de funcționalități de căutare pe Hugging Face Hub și asigură că datasetul poate fi găsit ușor de membrii comunității. Deoarece am creat un dataset custom aici, veți fi nevoiți să clonați repositoriul <code>datasets-tagging</code> și să rulați aplicația local. Iată cum arată interfața:</li></ol> <div class="flex justify-center" data-svelte-h="svelte-s62rok"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-tagger.png" alt="Interfața 'datasets-tagging'." width="80%"></div> <ol start="2" data-svelte-h="svelte-17itww3"><li>Citiți <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">ghidul 🤗 Datasets</a> despre crearea de dataset cards informative și utilizați-l ca șablon.</li></ol> <p data-svelte-h="svelte-c0261d">Puteți crea fișierul <em>README.md</em> direct pe Hub și puteți găsi un template pentru dataset card în repositoriul <code>lewtun/github-issues</code>. 
Un screenshot al unei dataset card completate este afișat mai jos.</p> <div class="flex justify-center" data-svelte-h="svelte-1twscot"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/dataset-card.png" alt="Dataset card." width="80%"></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-17ir9t2">✏️ <strong>Încercați!</strong> Utilizați aplicația <code>dataset-tagging</code> și <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">ghidul 🤗 Datasets</a> pentru a completa fișierul <em>README.md</em> pentru datasetul de probleme GitHub.</p></div> <p data-svelte-h="svelte-pd26ss">Astfel, am văzut în această secțiune că crearea unui dataset bun poate fi destul de complicată, dar, spre norocul nostru, încărcarea și oferirea acestuia comunității nu sunt. În secțiunea următoare, vom utiliza datasetul nou pentru a crea un motor de căutare semantic cu 🤗 Datasets care poate să asocieze întrebări cu cele mai relevante issues și comentarii.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1cvtjan">✏️ <strong>Încercați!</strong> Treceți prin pașii pe care i-am făcut în această secțiune pentru a crea un dataset de issues GitHub pentru o bibliotecă open source care vă place (alegeți altceva în afară de 🤗 Datasets, desigur!). 
Pentru puncte bonus, faceți fine-tune unui multilabel classifier pentru a prezice tagurile prezente în câmpul <code>labels</code>.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/rum/chapter5/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
	// Page bootstrap: publish the deployment paths on a global object, then
	// load the two client entry modules and hand control to `kit.start`.
	// NOTE(review): the global is presumably read by the imported entry
	// modules — they are not visible here, so confirm before renaming it.
	__sveltekit_1ftlxhy = {
		assets: "/docs/course/pr_1069/rum",
		base: "/docs/course/pr_1069/rum",
		env: {}
	};

	// Resolve the mount point up front, before any asynchronous work starts:
	// it is the parent element of the <script> tag that is currently running.
	const mountTarget = document.currentScript.parentElement;
	const pageData = [null, null];

	// Both imports are kicked off immediately so the modules load in parallel.
	const startEntry = import("/docs/course/pr_1069/rum/_app/immutable/entry/start.1de7c3d2.js");
	const appEntry = import("/docs/course/pr_1069/rum/_app/immutable/entry/app.1f82014c.js");

	Promise.all([startEntry, appEntry]).then((loaded) => {
		const [kit, app] = loaded;
		const startOptions = {
			node_ids: [0, 61],
			data: pageData,
			form: null,
			error: null
		};
		kit.start(app, mountTarget, startOptions);
	});
}
</script>

Xet Storage Details

Size:
85.8 kB
·
Xet hash:
4387e5bc145fcfcc2a076ca8e18d14241a6354596a0af83319bf705bbe48ab4b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.