Buckets:

rtrm's picture
download
raw
99.4 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Creare il proprio dataset&quot;,&quot;local&quot;:&quot;creare-il-proprio-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Recuperare i dati&quot;,&quot;local&quot;:&quot;recuperare-i-dati&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pulire i dati&quot;,&quot;local&quot;:&quot;pulire-i-dati&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Estendere il dataset&quot;,&quot;local&quot;:&quot;estendere-il-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Caricare il dataset sull’Hub Hugging Face&quot;,&quot;local&quot;:&quot;caricare-il-dataset-sullhub-hugging-face&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creare una dataset card&quot;,&quot;local&quot;:&quot;creare-una-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/it/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/singletons.60b4c7a2.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/paths.43b6516c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/0.bb8a536c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/37.0efd97d7.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Creare il proprio dataset&quot;,&quot;local&quot;:&quot;creare-il-proprio-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Recuperare i dati&quot;,&quot;local&quot;:&quot;recuperare-i-dati&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pulire i dati&quot;,&quot;local&quot;:&quot;pulire-i-dati&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Estendere il dataset&quot;,&quot;local&quot;:&quot;estendere-il-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Caricare il dataset sull’Hub Hugging Face&quot;,&quot;local&quot;:&quot;caricare-il-dataset-sullhub-hugging-face&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creare una dataset card&quot;,&quot;local&quot;:&quot;creare-una-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="creare-il-proprio-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creare-il-proprio-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 
11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creare il proprio dataset</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/it/chapter5/section5.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" 
src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/it/chapter5/section5.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1h3t35x">A volte il dataset che ti serve per la tua applicazione NLP non esiste, per cui dovrai crearlo da te. In questa sezione ti mostreremo come creare un corpus di <a href="https://github.com/features/issues" rel="nofollow">issue da GitHub</a>, usati solitamente per tenere traccia dei bug e delle feature nelle repository su GitHub. Questo corpus può essere usato in diversi modi, ad esempio:</p> <ul data-svelte-h="svelte-qe2w3o"><li>Esplorare il tempo impiegato per chiudere un issue, o per effettuare dei pull</li> <li>Addestrare un <em>classificatore multiclasse</em> che assegna a ogni issue dei metadati sulla base della descrizione dell’issue (ad esempio, “bug”, “enhancement”, “question”)</li> <li>Creare un motore di ricerca semantico per trovare quale issue corrisponde a una richiesta dell’utente</li></ul> <p data-svelte-h="svelte-dydfa0">Ci focalizzeremo sulla creazione del corpus, e nella prossima sezione affronteremo la creazione di un motore di ricerca semantico. Useremo gli issue GitHub associati a un progetto open source molto popolare: 🤗 Datasets! 
Diamo un’occhiata a come recuperare i dati e come esplorare le informazioni contenute negli issue.</p> <h2 class="relative group"><a id="recuperare-i-dati" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#recuperare-i-dati"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Recuperare i dati</span></h2> <p data-svelte-h="svelte-190qwz0">Puoi trovare tutti gli issue in 🤗 Datasets navigando nella <a href="https://github.com/huggingface/datasets/issues" rel="nofollow">sezione Issues della repository</a>. Come si vede dallo screenshot, al momento della scrittura c’erano 331 issue aperti e 668 issue chiusi.</p> <div class="flex justify-center" data-svelte-h="svelte-1htetkm"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues.png" alt="The GitHub issues associated with 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-1yipl9d">Se clicchi su uno di questi issue vedrai che contiene un titolo, una descrizione, e un set di etichette che caratterizzano l’issue. 
Un esempio è mostrato nello screenshot successivo.</p> <div class="flex justify-center" data-svelte-h="svelte-1jsgvzc"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-single.png" alt="A typical GitHub issue in the 🤗 Datasets repository." width="80%"></div> <p data-svelte-h="svelte-17sk5bj">Per scaricare gli issue della repository, useremo la <a href="https://docs.github.com/en/rest" rel="nofollow">REST API di GitHub</a> per interrogare l’<a href="https://docs.github.com/en/rest/reference/issues#list-repository-issues" rel="nofollow">endpoint <code>Issues</code></a>. Questo endpoint restituisce una lista di oggetti JSON, e ogni oggetto contiene un gran numero di campi, tra cui il titolo e la descrizione, così come dei metadati circa lo status dell’issue e altro ancora.</p> <p data-svelte-h="svelte-58mesu">Una maniera conveniente di scaricare gli issue è attraverso la libreria <code>requests</code>, che rappresenta il metodo standard di fare richieste HTTP in Python. 
Puoi installare la libreria attraverso il codice:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install requests<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-linx15">Una volta che la libreria è stata installata, puoi effettuare una richiesta GET all’endpoint <code>Issues</code> utilizzando la funzione <code>requests.get()</code>. 
Ad esempio, puoi eseguire il comando mostrato di seguito per recuperare il primo issue nella prima pagina:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> requests
url = <span class="hljs-string">&quot;https://api.github.com/repos/huggingface/datasets/issues?page=1&amp;per_page=1&quot;</span>
response = requests.get(url)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hwpfvr">L’oggetto <code>response</code> contiene un sacco di informazioni utili sulla richiesta, compreso il codice di stato HTTP:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.status_code<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">200</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n5f90">Lo status <code>200</code> indica che la richiesta è andata a buon fine (puoi trovare una lista di codici di stato HTTP <a href="https://it.wikipedia.org/wiki/Codici_di_stato_HTTP" rel="nofollow">qui</a>). Ma ciò che ci interessa davvero è il <em>payload</em>, a cui è possibile accedere utilizzando diversi formati come byte, stringhe, o JSON. 
Visto che sappiamo che i nostri issue sono in formato JSON, diamo un’occhiata al payload come segue:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;repository_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets&#x27;</span>,
<span class="hljs-string">&#x27;labels_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}&#x27;</span>,
<span class="hljs-string">&#x27;comments_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/comments&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/events&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">968650274</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0&#x27;</span>,
<span class="hljs-string">&#x27;number&#x27;</span>: <span class="hljs-number">2792</span>,
<span class="hljs-string">&#x27;title&#x27;</span>: <span class="hljs-string">&#x27;Update GooAQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;labels&#x27;</span>: [],
<span class="hljs-string">&#x27;state&#x27;</span>: <span class="hljs-string">&#x27;open&#x27;</span>,
<span class="hljs-string">&#x27;locked&#x27;</span>: <span class="hljs-literal">False</span>,
<span class="hljs-string">&#x27;assignee&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;assignees&#x27;</span>: [],
<span class="hljs-string">&#x27;milestone&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;comments&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T11:40:18Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;closed_at&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;active_lock_reason&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;pull_request&#x27;</span>: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/2792&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.diff&#x27;</span>,
<span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.patch&#x27;</span>},
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&#x27;[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.&#x27;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bmsgi4">Wow, quante informazioni! Possiamo vedere alcuni campi utili come <code>title</code>, <code>body</code> e <code>number</code> che descrivono l’issue, così come informazioni sull’utente che l’ha aperto.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ql7iz5">✏️ <strong>Prova tu!</strong> Clicca su alcuni degli URL nel payload JSON per farti un’idea del tipo di informazione a cui è collegato ogni issue GitHub.</p></div> <p data-svelte-h="svelte-jjlpsx">Come descritto nella <a href="https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting" rel="nofollow">documentazione di GitHub</a>, le richieste senza autenticazione sono limitate a 60 ogni ora. Benché si possa aumentare il parametro della query <code>per_page</code> per ridurre il numero di richieste, raggiungerai comunque il limite su qualunque repository che ha qualche migliaio di issue. Quindi, dovresti seguire le <a href="https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token" rel="nofollow">istruzioni</a> su come creare un <em>token di accesso personale</em> così da poter aumentare il limite a 5.000 richieste ogni ora. 
Una volta che hai ottenuto il tuo token, puoi includerlo come parte dell’header della richiesta:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->GITHUB_TOKEN = xxx <span class="hljs-comment"># inserisci qui il tuo token GitHub</span>
headers = {<span class="hljs-string">&quot;Authorization&quot;</span>: <span class="hljs-string">f&quot;token <span class="hljs-subst">{GITHUB_TOKEN}</span>&quot;</span>}<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-bmnegk">⚠️ Fai attenzione a non condividere un notebook con il tuo <code>GITHUB_TOKEN</code> al suo interno. Ti consigliamo di cancellare l’ultima cella una volta che l’hai eseguita per evitare di far trapelare quest’informazione accidentalmente. Meglio ancora, salva il tuo token in un file <em>.env</em> e usa la <a href="https://github.com/theskumar/python-dotenv" rel="nofollow">libreria <code>python-dotenv</code></a> per caricarlo automaticamente come una variabile d’ambiente.</p></div> <p data-svelte-h="svelte-ge0v0t">Ora che abbiamo il nostro token di accesso, creiamo una funzione che scarichi tutti gli issue da una repository GitHub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> math
<span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path
<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
<span class="hljs-keyword">from</span> tqdm.notebook <span class="hljs-keyword">import</span> tqdm
<span class="hljs-keyword">def</span> <span class="hljs-title function_">fetch_issues</span>(<span class="hljs-params">
owner=<span class="hljs-string">&quot;huggingface&quot;</span>,
repo=<span class="hljs-string">&quot;datasets&quot;</span>,
num_issues=<span class="hljs-number">10_000</span>,
rate_limit=<span class="hljs-number">5_000</span>,
issues_path=Path(<span class="hljs-params"><span class="hljs-string">&quot;.&quot;</span></span>),
</span>):
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> issues_path.is_dir():
issues_path.mkdir(exist_ok=<span class="hljs-literal">True</span>)
batch = []
all_issues = []
per_page = <span class="hljs-number">100</span> <span class="hljs-comment"># Numero di issue da restituire per pagina</span>
num_pages = math.ceil(num_issues / per_page)
base_url = <span class="hljs-string">&quot;https://api.github.com/repos&quot;</span>
<span class="hljs-keyword">for</span> page <span class="hljs-keyword">in</span> tqdm(<span class="hljs-built_in">range</span>(num_pages)):
<span class="hljs-comment"># La query ha state=all per ottenere sia gli issue aperti che quelli chiusi</span>
query = <span class="hljs-string">f&quot;issues?page=<span class="hljs-subst">{page}</span>&amp;per_page=<span class="hljs-subst">{per_page}</span>&amp;state=all&quot;</span>
issues = requests.get(<span class="hljs-string">f&quot;<span class="hljs-subst">{base_url}</span>/<span class="hljs-subst">{owner}</span>/<span class="hljs-subst">{repo}</span>/<span class="hljs-subst">{query}</span>&quot;</span>, headers=headers)
batch.extend(issues.json())
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(batch) &gt; rate_limit <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(all_issues) &lt; num_issues:
all_issues.extend(batch)
batch = [] <span class="hljs-comment"># puliamo la batch per il termine successivo</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Reached GitHub rate limit. Sleeping for one hour ...&quot;</span>)
time.sleep(<span class="hljs-number">60</span> * <span class="hljs-number">60</span> + <span class="hljs-number">1</span>)
all_issues.extend(batch)
df = pd.DataFrame.from_records(all_issues)
df.to_json(<span class="hljs-string">f&quot;<span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>, orient=<span class="hljs-string">&quot;records&quot;</span>, lines=<span class="hljs-literal">True</span>)
<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;Downloaded all the issues for <span class="hljs-subst">{repo}</span>! Dataset stored at <span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12lrurn">Ora quando eseguiremo <code>fetch_issues()</code>, scaricherà tutti gli issue in batch per evitare di superare il limite di GitHub del numero di richieste per ora; il risultato sarà conservato in un file <em>repository_name-issues.jsonl</em>, in cui ogni linea è un oggetto JSON che rappresenta un issue. Usiamo questa funzione per recuperare tutti gli issue da 🤗 Datasets:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># A seconda della tua connessione internet, ci potrebbe volere qualche secondo...</span>
fetch_issues()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fepqzy">Una volta che gli issue sono stati scaricati, possiamo caricarli in locale usando le nuove abilità imparate nella <a href="/course/chapter5/2">sezione 2</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=<span class="hljs-string">&quot;datasets-issues.jsonl&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;timeline_url&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>],
num_rows: <span class="hljs-number">3019</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wbhe20">Benissimo, abbiamo creato il nostro primo dataset da zero! Ma perché ci sono migliaia di issue quando la <a href="https://github.com/huggingface/datasets/issues" rel="nofollow">sezione Issues</a> della repository 🤗 Datasets mostra circa 1,000 issue in totale 🤔? Come indicato nella <a href="https://docs.github.com/en/rest/reference/issues#list-issues-assigned-to-the-authenticated-user" rel="nofollow">documentazione di GitHub</a>, è perché abbiamo scaricato anche le richieste di pull:</p> <blockquote data-svelte-h="svelte-165ve2g"><p>GitHub’s REST API v3 considers every pull request an issue, but not every issue is a pull request. For this reason, “Issues” endpoints may return both issues and pull requests in the response. You can identify pull requests by the <code>pull_request</code> key. Be aware that the <code>id</code> of a pull request returned from “Issues” endpoints will be an issue id.</p></blockquote> <p data-svelte-h="svelte-17d1r3t">(<em>La REST API v3 di GitHub considera ogni richiesta di pull un issue, ma non ogni issue è una richiesta di pull. Per questa ragione, gli endpoint “Issues” potrebbero restituire nella risposta sia gli issue che le richieste di pull. È possibile identificare le richieste di pull utilizzando la chiave <code>pull_request</code>. 
Tieni presente che l’<code>id</code> di una richiesta di pull restituita dagli endpoint <code>Issues</code> sarà un id di un issue.</em>)</p> <p data-svelte-h="svelte-1uhjti4">Poiché i contenuti degli issue e delle richieste di pull sono molto diversi, facciamo un po’ di preprocessing per permetterci di distinguere tra i due.</p> <h2 class="relative group"><a id="pulire-i-dati" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pulire-i-dati"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pulire i dati</span></h2> <p data-svelte-h="svelte-5irsew">Il frammento precedente della documentazione di GitHub ci dice che la colonna <code>pull_request</code> può essere utilizzata per distinguere gli issue e le richieste di pull. Diamo uno sguardo a un esempio casuale per vedere qual è la differenza. 
Come abbiamo fatto nella <a href="/course/chapter5/3">sezione 3</a>, concateneremo <code>Dataset.shuffle()</code> e <code>Dataset.select()</code> per creare un campione random, e poi zipperemo le colonne <code>html_url</code> e <code>pull_request</code> così da poter paragonare i diversi URL:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sample = issues_dataset.shuffle(seed=<span class="hljs-number">666</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>))
<span class="hljs-comment"># Stampiamo le entrate `URL` e `pull_request`</span>
<span class="hljs-keyword">for</span> url, pr <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(sample[<span class="hljs-string">&quot;html_url&quot;</span>], sample[<span class="hljs-string">&quot;pull_request&quot;</span>]):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; URL: <span class="hljs-subst">{url}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; Pull request: <span class="hljs-subst">{pr}</span>\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">850</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/850&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.patch&#x27;</span>}
&gt;&gt; URL: https://github.com/huggingface/datasets/issues/<span class="hljs-number">2773</span>
&gt;&gt; Pull request: <span class="hljs-literal">None</span>
&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">783</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/783&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.patch&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-196qdbm">Possiamo vedere che ogni richiesta di pull è associata a diversi URL, mentre i comuni issue hanno un’entrata <code>None</code>. Possiamo usare questa distinzione per creare una nuova colonna <code>is_pull_request</code> che controlla se il campo <code>pull_request</code> sia <code>None</code> o meno:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full 
left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;is_pull_request&quot;</span>: <span class="hljs-literal">False</span> <span class="hljs-keyword">if</span> x[<span class="hljs-string">&quot;pull_request&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">True</span>}
)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-bcwlrn">✏️ <strong>Prova tu!</strong> Calcola il tempo medio che ci vuole a chiudere un issue su 🤗 Datasets. Potrebbe essere utile usare la funzione <code>Dataset.filter()</code> per eliminare le richieste di pull e gli issue aperti, e puoi usare la funzione <code>Dataset.set_format()</code> per convertire il dataset in un <code>DataFrame</code> così da poter manipolare facilmente i timestamp <code>created_at</code> e <code>closed_at</code>. Per dei punti bonus, calcola il tempo medio che ci vuole a chiudere le richieste di pull.</p></div> <p data-svelte-h="svelte-1e9tu80">Anche se potremmo procedere e pulire ulteriormente il dataset eliminando o rinominando alcune colonne, è solitamente buona prassi lasciare il dataset quanto più intatto possibile in questo stadio, così che possa essere utilizzato facilmente in più applicazioni.</p> <p data-svelte-h="svelte-z2nyet">Prima di caricare il nostro dataset sull’Hub Hugging Face, dobbiamo occuparci di una cosa che manca: i commenti associati a ogni issue e richiesta di pull. 
Hai indovinato, li aggiungeremo utilizzando la REST API di GitHub!</p> <h2 class="relative group"><a id="estendere-il-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#estendere-il-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Estendere il dataset</span></h2> <p data-svelte-h="svelte-ywuirq">Come mostrato negli screenshot di seguito, i commenti associati a un issue o una richiesta di pull offrono una fonte molto ricca di informazioni, soprattutto se siamo interessati a costruire un motore di ricerca per rispondere alle richieste degli utenti sulla libreria.</p> <div class="flex justify-center" data-svelte-h="svelte-1fxxwaz"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-comment.png" alt="Comments associated with an issue about 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-q4eegl">La REST API di GitHub offre un <a href="https://docs.github.com/en/rest/reference/issues#list-issue-comments" rel="nofollow">endpoint <code>Comments</code></a> che restituisce tutti i commenti associati con un numero di issue. 
Testiamo quest’endpoint per vedere cosa restituisce:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issue_number = <span class="hljs-number">2792</span>
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/comments/897594128&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128&#x27;</span>,
<span class="hljs-string">&#x27;issue_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">897594128</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;IC_kwDODunzps41gDMQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:21:52Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?&quot;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-69hg6q">Possiamo vedere che il commento è archiviato nel campo <code>body</code>, quindi possiamo scrivere una semplice funzione che restituisce tutti i commenti associati con un issue estraendo i contenuti di <code>body</code> per ogni elemento in <code>response.json()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_comments</span>(<span class="hljs-params">issue_number</span>):
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
<span class="hljs-keyword">return</span> [r[<span class="hljs-string">&quot;body&quot;</span>] <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> response.json()]
<span class="hljs-comment"># Testiamo la nostra funzione</span>
get_comments(<span class="hljs-number">2792</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it 
works fine. Any suggestions on how can I avoid this error?&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bt9zz6">Sembra andar bene, quindi possiamo usare <code>Dataset.map()</code> per aggiungere una nuova colonna <code>comments</code> a ogni issue nel nostro dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># A seconda della tua connessione, potrebbe volerci qualche secondo...</span>
issues_with_comments_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;comments&quot;</span>: get_comments(x[<span class="hljs-string">&quot;number&quot;</span>])}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bdvuzp">Come passaggio finale, salviamo il dataset esteso assieme ai nostri dati non processati, così da poter caricare entrambi sull’Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_with_comments_dataset.to_json(<span class="hljs-string">&quot;issues-datasets-with-comments.jsonl&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="caricare-il-dataset-sullhub-hugging-face" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#caricare-il-dataset-sullhub-hugging-face"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Caricare il dataset sull’Hub Hugging Face</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HaN6qCr_Afc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-wrr7o5">Ora che abbiamo il nostro dataset esteso, è arrivato il momento di caricarlo sull’Hub, così da poterlo condividere con la community! Per caricare il dataset useremo la <a href="https://github.com/huggingface/huggingface_hub" rel="nofollow">libreria 🤗 Hub</a>, che ci permette di interagire con l’Hub di Hugging Face attraverso un’API di Python. 🤗 Hub è preinstallato con 🤗 Transformers, così possiamo usarlo da subito. 
Ad esempio, possiamo usare la funzione <code>list_datasets()</code> per avere informazioni su tutti i dataset pubblici attualmente presenti sull’Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> list_datasets
all_datasets = list_datasets()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Number of datasets on Hub: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(all_datasets)}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(all_datasets[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Number of datasets on Hub: <span class="hljs-number">1487</span>
Dataset Name: acronym_identification, Tags: [<span class="hljs-string">&#x27;annotations_creators:expert-generated&#x27;</span>, <span class="hljs-string">&#x27;language_creators:found&#x27;</span>, <span class="hljs-string">&#x27;languages:en&#x27;</span>, <span class="hljs-string">&#x27;licenses:mit&#x27;</span>, <span class="hljs-string">&#x27;multilinguality:monolingual&#x27;</span>, <span class="hljs-string">&#x27;size_categories:10K&lt;n&lt;100K&#x27;</span>, <span class="hljs-string">&#x27;source_datasets:original&#x27;</span>, <span class="hljs-string">&#x27;task_categories:structure-prediction&#x27;</span>, <span class="hljs-string">&#x27;task_ids:structure-prediction-other-acronym-identification&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sbkpb8">Possiamo vedere che al momento ci sono circa 1.500 dataset sull’Hub, e la funzione <code>list_datasets()</code> inoltre permette di avere alcuni metadati su ciascuna repository.</p> <p data-svelte-h="svelte-1x1iq75">Per ciò che ci riguarda, la prima cosa che dobbiamo fare è creare una nuova repository nell’Hub. 
Per far ciò abbiamo bisogno di un token di autenticazione, che può essere ottenuto effettuando l’accesso nell’Hub Hugging Face con la funzione <code>notebook_login()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fkm3k">Questo creerà un widget in cui puoi inserire il tuo username e la tua password, e un token API verrà salvato in <em>~/.huggingface/token</em>. Se stai eseguendo il codice in un terminale, puoi effettuare l’accesso attraverso il comando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1q0q1uc">Una volta fatto questo, possiamo creare una nuova repository con la funzione <code>create_repo()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code 
excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> create_repo
repo_url = create_repo(name=<span class="hljs-string">&quot;github-issues&quot;</span>, repo_type=<span class="hljs-string">&quot;dataset&quot;</span>)
repo_url<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&#x27;https://huggingface.co/datasets/lewtun/github-issues&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ro7sli">In quest’esempio, abbiamo creato una repository vuota chiamata <code>github-issues</code> con l’username <code>lewtun</code> (l’username dovrebbe essere quello del tuo account Hub quando esegui questo codice!).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-17hauie">✏️ <strong>Prova tu!</strong> Usa le tue credenziali dell’Hub Hugging Face per ottenere un token e 
creare una repository vuota chiamata <code>github-issues</code>. Ricordati di <strong>non salvare mai le tue credenziali</strong> su Colab o qualunque altra repository, perché potrebbero essere recuperate da malintenzionati.</p></div> <p data-svelte-h="svelte-axrgi">Ora, cloniamo la repository dall’Hub alla nostra macchina e copiamo al suo interno i file del nostro dataset. 🤗 Hub contiene una classe <code>Repository</code> che ha al suo interno molti dei comandi più comuni di Git, per cui per clonare la repository in remoto dobbiamo semplicemente fornire l’URL e il percorso locale in cui desideriamo clonare:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> Repository
repo = Repository(local_dir=<span class="hljs-string">&quot;github-issues&quot;</span>, clone_from=repo_url)
!cp issues-datasets-<span class="hljs-keyword">with</span>-comments.jsonl github-issues/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tfh3nd">Di default, diverse estensioni file (ad esempio <em>.bin</em>, <em>.gz</em> e <em>.zip</em>) sono registrate da Git LFS, così che i file di grandi dimensioni possono essere gestiti all’interno dello stesso workflow. Puoi trovare una lista delle estensioni di file monitorati nel file <em>.gitattributes</em> della repository. Per includere il formato JSON Lines a questa lista, possiamo utilizzare il comando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.lfs_track(<span class="hljs-string">&quot;*.jsonl&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-119nihd">Ora possiamo usare <code>Repository.push_to_hub()</code> per caricare il dataset 
sull’Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.push_to_hub()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-f7bnod">Se navighiamo fino all’URL contenuto in <code>repo_url</code>, vedremo che il file del nostro dataset è stato caricato.</p> <div class="flex justify-center" data-svelte-h="svelte-18puw29"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/hub-repo.png" alt="Our dataset repository on the Hugging Face Hub." 
width="80%"></div> <p data-svelte-h="svelte-fxrj9t">Da qui, chiunque può scaricare il dataset semplicemente inserendo l’ID della repository come argomento <code>path</code> di <code>load_dataset()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->remote_dataset = load_dataset(<span class="hljs-string">&quot;lewtun/github-issues&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
remote_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">2855</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1woaiq9">Bene, abbiamo caricato il nostro dataset sull’Hub, e può essere utilizzato da tutti! C’è un’altra cosa importante che dobbiamo fare: aggiungere una <em>dataset card</em> che spiega come è stato creato il corpus, e offre altre informazioni utili per la community.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ttry9d">💡 Puoi caricare un dataset nell’Hub di Hugging Face anche direttamente dal terminale usando <code>huggingface-cli</code> e un po’ di magia Git. La <a href="https://huggingface.co/docs/datasets/share#share-a-dataset-using-the-cli" rel="nofollow">guida a 🤗 Datasets</a> spiega come farlo.</p></div> <h2 class="relative group"><a id="creare-una-dataset-card" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creare-una-dataset-card"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creare una dataset card</span></h2> <p 
data-svelte-h="svelte-vxtenw">I dataset ben documentati sono più utili agli altri utenti (compreso il futuro te!), poiché spiegano il contesto per permettere agli utenti di decidere se un dataset può essere utile, e valutare gli eventuali bias o rischi associati nell’utilizzo del dataset.</p> <p data-svelte-h="svelte-krhhu0">Sull’Hub di Hugging Face, queste informazioni si trovano nel file <em>README.md</em> della repository. Ci sono due passaggi principali che dovresti seguire prima di creare questo file:</p> <ol data-svelte-h="svelte-12j1wif"><li>Usa l’<a href="https://huggingface.co/datasets/tagging/" rel="nofollow">applicazione <code>datasets-tagging</code></a> per creare tag di metadati in formato YAML. Questi tag sono usati per una serie di funzioni di ricerca sull’Hub di Hugging Face, e assicurano che il tuo dataset possa essere facilmente trovato dai membri della community. Poiché abbiamo creato un nostro dataset, dovrai clonare la repository <code>datasets-tagging</code>, ed eseguire l’applicazione in locale. Ecco com’è l’interfaccia:</li></ol> <div class="flex justify-center" data-svelte-h="svelte-1gqifn5"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-tagger.png" alt="The 'datasets-tagging' interface." width="80%"></div> <ol start="2" data-svelte-h="svelte-16lcu59"><li>Leggi la <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">guida 🤗 Datasets</a> sulla creazione di dataset card informative, e usala come template.</li></ol> <p data-svelte-h="svelte-1qxc0rz">Puoi creare il file <em>README.md</em> direttamente sull’Hub, e puoi trovare un modello per una dataset card nella repository <code>lewtun/github-issues</code>. 
Di seguito è mostrato uno screenshot di una dataset card già compilata.</p> <div class="flex justify-center" data-svelte-h="svelte-ct1wn8"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/dataset-card.png" alt="A dataset card." width="80%"></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1iu8kwm">✏️ <strong>Prova tu!</strong> Usa l’applicazione <code>datasets-tagging</code> e la <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">guida 🤗 Datasets</a> per completare il file <em>README.md</em> per il tuo dataset di issue di GitHub.</p></div> <p data-svelte-h="svelte-kroznp">È tutto! Abbiamo visto in questa sezione che creare un buon dataset può essere un’impresa, ma per fortuna caricarlo e condividerlo con la community è molto più semplice. Nella prossima sezione useremo il nostro nuovo dataset per creare un motore di ricerca semantico con 🤗 Datasets, che abbina alle domande gli issue e i commenti più rilevanti.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-aozvfr">✏️ <strong>Prova tu!</strong> Segui i passi che abbiamo eseguito in questa sezione per creare un dataset di issue GitHub per la tua libreria open source preferita (ovviamente scegli qualcosa di diverso da 🤗 Datasets!). 
Per punti bonus, esegui il fine-tuning di un classificatore multiclasse per predire i tag presenti nel campo <code>labels</code>.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/it/chapter5/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
// Client bootstrap: loads the two entry modules and hands the element that
// contains this <script> to `kit.start`. The bare block keeps the `const`
// bindings below out of the surrounding script scope.
{
// Assigned without var/let/const, so in this classic (non-module) script it
// becomes a global. Presumably read by the imported chunks to resolve the
// asset/base paths — confirm against the build output before renaming.
__sveltekit_u8ez91 = {
assets: "/docs/course/pr_1069/it",
base: "/docs/course/pr_1069/it",
env: {}
};
// Captured synchronously: `document.currentScript` is only set while this
// script is executing, so it cannot be read later inside the `.then` callback.
const element = document.currentScript.parentElement;
// Two null entries forwarded as `data`; presumably one slot per id in
// `node_ids` below (i.e. no server-provided data) — TODO confirm.
const data = [null,null];
// Both entry modules are requested in parallel and `start` runs only once both
// have resolved. No rejection handler is attached here, so a failed import
// surfaces as an unhandled promise rejection.
Promise.all([
import("/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js"),
import("/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js")
]).then(([kit, app]) => {
// `kit` is the start module, `app` the app module; `element` is the target.
kit.start(app, element, {
node_ids: [0, 37],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
99.4 kB
·
Xet hash:
68941e68dbe6b27c217500a3d417a62d4beb763ecbb5dedbb926f26ec3ce0635

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.