Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Fare il debug della training pipeline","local":"fare-il-debug-della-training-pipeline","sections":[{"title":"Fare il debug della training pipeline","local":"fare-il-debug-della-training-pipeline","sections":[{"title":"Controlla i dati","local":"controlla-i-dati","sections":[],"depth":3},{"title":"Dai dataset ai dataloader","local":"dai-dataset-ai-dataloader","sections":[],"depth":3},{"title":"Passaggio attraverso il modello","local":"passaggio-attraverso-il-modello","sections":[],"depth":3},{"title":"Esecuzione di un passaggio di ottimizzazione","local":"esecuzione-di-un-passaggio-di-ottimizzazione","sections":[],"depth":3},{"title":"Come gestire gli errori out-of-memory di CUDA","local":"come-gestire-gli-errori-out-of-memory-di-cuda","sections":[],"depth":3},{"title":"Valutazione del modello","local":"valutazione-del-modello","sections":[],"depth":3}],"depth":2},{"title":"Debug degli errori silenziosi durante l’addestramento","local":"debug-degli-errori-silenziosi-durante-laddestramento","sections":[{"title":"Controllare i dati (di nuovo!)","local":"controllare-i-dati-di-nuovo","sections":[],"depth":3},{"title":"Fare overfitting del modello su un batch","local":"fare-overfitting-del-modello-su-un-batch","sections":[],"depth":3},{"title":"Non calibrare niente prima di avere una prima baseline","local":"non-calibrare-niente-prima-di-avere-una-prima-baseline","sections":[],"depth":3},{"title":"Chiedere aiuto","local":"chiedere-aiuto","sections":[],"depth":3}],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1069/it/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/singletons.60b4c7a2.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/paths.43b6516c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/index.2bf4358c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/0.bb8a536c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/nodes/44.fc57a1f7.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Tip.363c041f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/Youtube.1e50a667.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CodeBlock.4e987730.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/FrameworkSwitchCourse.8d4d4ab6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/it/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Fare il debug della training pipeline","local":"fare-il-debug-della-training-pipeline","sections":[{"title":"Fare il debug della training pipeline","local":"fare-il-debug-della-training-pipeline","sections":[{"title":"Controlla i dati","local":"controlla-i-dati","sections":[],"depth":3},{"title":"Dai dataset ai dataloader","local":"dai-dataset-ai-dataloader","sections":[],"depth":3},{"title":"Passaggio attraverso il modello","local":"passaggio-attraverso-il-modello","sections":[],"depth":3},{"title":"Esecuzione di un passaggio di ottimizzazione","local":"esecuzione-di-un-passaggio-di-ottimizzazione","sections":[],"depth":3},{"title":"Come gestire gli errori out-of-memory di CUDA","local":"come-gestire-gli-errori-out-of-memory-di-cuda","sections":[],"depth":3},{"title":"Valutazione del modello","local":"valutazione-del-modello","sections":[],"depth":3}],"depth":2},{"title":"Debug degli errori silenziosi durante l’addestramento","local":"debug-degli-errori-silenziosi-durante-laddestramento","sections":[{"title":"Controllare i dati (di nuovo!)","local":"controllare-i-dati-di-nuovo","sections":[],"depth":3},{"title":"Fare overfitting del modello su un batch","local":"fare-overfitting-del-modello-su-un-batch","sections":[],"depth":3},{"title":"Non calibrare niente prima di avere una prima baseline","local":"non-calibrare-niente-prima-di-avere-una-prima-baseline","sections":[],"depth":3},{"title":"Chiedere aiuto","local":"chiedere-aiuto","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <h1 class="relative group"><a id="fare-il-debug-della-training-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fare-il-debug-della-training-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fare il debug della training pipeline</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-8-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/it/chapter8/section4.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/it/chapter8/section4.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-oons04">Hai scritto un bello script per addestrare o affinare un modello su un determinato compito, seguendo scrupolosamente i consigli del <a href="/course/chapter7">Capitolo 7</a>. Ma quando lanci il comando <code>trainer.train()</code>, succede qualcosa di orribile: si ottiene un errore 😱! O peggio, tutto sembra andare bene e il training viene eseguito senza errori, ma il modello che ne risulta fa schifo. In questa sezione mostreremo cosa è possibile fare per eseguire il debug di questo tipo di problemi.</p> <h2 class="relative group"><a id="fare-il-debug-della-training-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fare-il-debug-della-training-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fare il debug della training pipeline</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/L-WSwUWde1U" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-yuehgd">Il problema quando si ha un errore da <code>trainer.train()</code> è che potrebbe provenire da più fonti, poiché il <code>Trainer</code> di solito mette insieme molte cose. Converte i dataset in <em>dataloader</em>, quindi l’errore potrebbe essere dato da qualcosa di sbagliato nel dataset stesso, o da un problema qualche problema nel provare a raggruppare in un batch elementi del dataset. Poi prende un batch di dati e lo invia al modello, quindi il problema potrebbe anche essere nel codice del modello. Successivamente, calcola i gradienti ed esegue la fase di ottimizzazione, quindi il problema potrebbe essere nel tuo <em>optimizer</em>. E anche se tutto va bene per il training, qualcosa potrebbe andare storto durante la valutazione se c’è un problema con la metrica selezionata.</p> <p data-svelte-h="svelte-1vfby3">Il modo migliore per eseguire il debug di un errore che si verifica in <code>trainer.train()</code> è quello di esaminare manualmente l’intera pipeline per vedere dove le cose sono andate storte. L’errore è spesso molto facile da risolvere.</p> <p data-svelte-h="svelte-n2uzsr">Per dimostrarlo, useremo il seguente script che ha lo scopo di affinare un modello DistilBERT sul <a href="https://huggingface.co/datasets/glue" rel="nofollow">dataset MNLI</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| args = TrainingArguments( | |
| <span class="hljs-string">f"distilbert-finetuned-mnli"</span>, | |
| evaluation_strategy=<span class="hljs-string">"epoch"</span>, | |
| save_strategy=<span class="hljs-string">"epoch"</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| weight_decay=<span class="hljs-number">0.01</span>, | |
| ) | |
| metric = load_metric(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| trainer = Trainer( | |
| model, | |
| args, | |
| train_dataset=raw_datasets[<span class="hljs-string">"train"</span>], | |
| eval_dataset=raw_datasets[<span class="hljs-string">"validation_matched"</span>], | |
| compute_metrics=compute_metrics, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1142yq6">Se provi a eseguirlo, otterrai un errore piuttosto criptico:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">'ValueError: You have to specify either input_ids or inputs_embeds'</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="controlla-i-dati" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controlla-i-dati"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Controlla i dati</span></h3> <p data-svelte-h="svelte-d33wq2">Non c’è bisogno di dirlo, ma se i dati sono corrotti, il <code>Trainer</code> non sarà in grado di formare i batch e tanto meno di addestrare il modello. Quindi, per prima cosa, è necessario dare un’occhiata a cosa c’è nel training set(<em>insieme di addestramento</em>).</p> <p data-svelte-h="svelte-135ov84">Per evitare di passare infinite ore a cercare di risolvere qualcosa che non è la fonte del bug, consigliamo di usare <code>trainer.train_dataset</code> per controllare l’insieme di dati e nient’altro. Quindi facciamo così:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train_dataset[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">'hypothesis'</span>: <span class="hljs-string">'Product and geography are what make cream skimming work. '</span>, | |
| <span class="hljs-string">'idx'</span>: <span class="hljs-number">0</span>, | |
| <span class="hljs-string">'label'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'premise'</span>: <span class="hljs-string">'Conceptually cream skimming has two basic dimensions - product and geography.'</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cqtrap">Hai notato qualcosa di sbagliato? Questo, insieme al messaggio di errore sulla mancanza di <code>input_ids</code>, dovrebbe farci capire che qui abbiamo testo, non numeri che invece il modello può interpretare. In questo caso, l’errore originale è molto fuorviante, perché il <code>Trainer</code> rimuove automaticamente le colonne che non corrispondono alla firma del modello (cioè i parametri che il modello si aspetta). Ciò significa che in questo caso tutto, a parte <em>label</em>, è stato scartato. Non c’è stato quindi nessun problema nel creare i batch di dati e poi inviarli al modello, invece è il modello che a sua volta si è lamentato di non aver ricevuto l’input corretto.</p> <p data-svelte-h="svelte-1u5e5w1">Perché i dati non sono stati processati? Abbiamo usato il metodo <code>Dataset.map()</code> sui set di dati per applicare il tokenizer a ogni campione. Ma se si osserva attentamente il codice, si noterà che abbiamo commesso un errore nel passare i training set e il validation set (<em>insieme di valutazione</em>) al <code>Trainer</code>. Qui invece di usare <code>tokenized_datasets</code>, abbiamo usato <code>raw_datasets</code> 🤦. Quindi correggiamo questo errore!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| args = TrainingArguments( | |
| <span class="hljs-string">f"distilbert-finetuned-mnli"</span>, | |
| evaluation_strategy=<span class="hljs-string">"epoch"</span>, | |
| save_strategy=<span class="hljs-string">"epoch"</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| weight_decay=<span class="hljs-number">0.01</span>, | |
| ) | |
| metric = load_metric(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| trainer = Trainer( | |
| model, | |
| args, | |
| train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>], | |
| eval_dataset=tokenized_datasets[<span class="hljs-string">"validation_matched"</span>], | |
| compute_metrics=compute_metrics, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1f2dpvs">Questo nuovo codice ora darà un errore diverso (un miglioramento!):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">'ValueError: expected sequence of length 43 at dim 1 (got 37)'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10dlgl9">Osservando il traceback, si nota che l’errore si verifica nel punto in cui i dati vengono raccolti:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->~/git/transformers/src/transformers/data/data_collator.py <span class="hljs-keyword">in</span> torch_default_data_collator(features) | |
| <span class="hljs-number">105</span> batch[k] = torch.stack([f[k] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features]) | |
| <span class="hljs-number">106</span> <span class="hljs-keyword">else</span>: | |
| --> <span class="hljs-number">107</span> batch[k] = torch.tensor([f[k] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features]) | |
| <span class="hljs-number">108</span> | |
| <span class="hljs-number">109</span> <span class="hljs-keyword">return</span> batch<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1smbcnr">Quindi, bisogna concentrarsi su questo. Prima di farlo, però, finiamo d’ispezionare i nostri dati, per essere sicuri al 100% che siano corretti.</p> <p data-svelte-h="svelte-1qi5643">Una cosa da fare sempre quando si esegue il debug di una sessione di addestramento è dare un’occhiata agli input del modello decodificati. Non possiamo dare un senso ai numeri che gli diamo direttamente in pasto, quindi dobbiamo guardare cosa rappresentano quei numeri. Nella computer vision, ad esempio, ciò significa guardare le immagini decodificate dei pixel passati, nel campo del riconoscimento vocale significa ascoltare i campioni audio decodificati e per il nostro esempio di NLP significa usare il nostro tokenizer per decodificare gli input:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decode(trainer.train_dataset[<span class="hljs-number">0</span>][<span class="hljs-string">"input_ids"</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">'[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP]'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2lxf0o">Questo sembra corretto. Si dovrebbe fare così per tutte le chiavi degli input:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train_dataset[<span class="hljs-number">0</span>].keys()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->dict_keys([<span class="hljs-string">'attention_mask'</span>, <span class="hljs-string">'hypothesis'</span>, <span class="hljs-string">'idx'</span>, <span class="hljs-string">'input_ids'</span>, <span class="hljs-string">'label'</span>, <span class="hljs-string">'premise'</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-p2ursw">Si noti che le chiavi che non corrispondono a input accettati dal modello saranno automaticamente scartate, quindi qui terremo solo <code>input_ids</code>, <code>attention_mask</code> e <code>label</code> (che sarà rinominata <code>labels</code>). Per ricontrollare la firma del modello, si può stampare la classe del modello e poi controllare la sua documentazione:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">type</span>(trainer.model)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v4xkk">Quindi, nel nostro caso, possiamo controllare i parametri accettati in <a href="https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification" rel="nofollow">questa pagina</a>. Il <code>Trainer</code> registrerà anche le colonne che sta scartando.</p> <p data-svelte-h="svelte-1104rpz">Abbiamo controllato che gli ID in ingresso siano corretti decodificandoli. Il prossimo passo è la <code>attention_mask</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train_dataset[<span class="hljs-number">0</span>][<span class="hljs-string">"attention_mask"</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16pihx1">Poiché non abbiamo applicato il padding nel nostro preprocessing, questo sembra perfettamente naturale. Per essere sicuri che non ci siano problemi con la attention mask (<em>maschera di attenzione</em>), controlliamo che sia della stessa lunghezza dei nostri ID di input:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">len</span>(trainer.train_dataset[<span class="hljs-number">0</span>][<span class="hljs-string">"attention_mask"</span>]) == <span class="hljs-built_in">len</span>( | |
| trainer.train_dataset[<span class="hljs-number">0</span>][<span class="hljs-string">"input_ids"</span>] | |
| )<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-literal">True</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cfu9nl">Bene! Infine, controlliamo la nostra label:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train_dataset[<span class="hljs-number">0</span>][<span class="hljs-string">"label"</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">1</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12oyv9w">Come gli ID degli input, si tratta di un numero che non ha senso di per sé. Come abbiamo visto prima, la mappa tra gli interi e i nomi delle label è memorizzata all’interno dell’attributo <code>names</code> della corrispondente <em>feature</em> del dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.train_dataset.features[<span class="hljs-string">"label"</span>].names<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'entailment'</span>, <span class="hljs-string">'neutral'</span>, <span class="hljs-string">'contradiction'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ph2bbc">Quindi <code>1</code> significa <code>neutral</code> (<em>neutro</em>), il che significa che le due frasi viste sopra non sono in contraddizione e che la prima non implica la seconda. Sembra corretto!</p> <p data-svelte-h="svelte-io8wo7">Non abbiamo token type ID (<em>ID del tipo di token</em>) qui, perché DistilBERT non li prevede; se li hai nel tuo modello, devi anche assicurarti che corrispondano correttamente alla posizione della prima e della seconda frase nell’input.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-4bjsze">✏️ <strong>Prova tu!</strong> Controlla che tutto sia corretto nel secondo elemento del training set.</p></div> <p data-svelte-h="svelte-1f8l9un">In questo caso, il controllo viene effettuato solo sul training set, ma è necessario ricontrollare allo stesso modo anche il validation set e il test set.</p> <p data-svelte-h="svelte-1wcpo2k">Ora che sappiamo che i nostri set di dati sono corretti, è il momento di verificare la fase successiva della pipeline di addestramento.</p> <h3 class="relative group"><a id="dai-dataset-ai-dataloader" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dai-dataset-ai-dataloader"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dai dataset ai dataloader</span></h3> <p data-svelte-h="svelte-1qbf1v6">La prossima cosa che può andare storta nella pipeline di addestramento è quando il <code>Trainer</code> cerca di formare dei batch dal training o dal validation set. Una volta che si è sicuri che i set di dati del <code>Trainer</code> sono corretti, si può provare a formare manualmente un batch eseguendo quanto segue (sostituire <code>train</code> con <code>eval</code> per il dataloader di validazione):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> trainer.get_train_dataloader(): | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cm85vj">Questo codice crea il training dataloader (<em>caricatore di dati di addestramento</em>), quindi lo itera, fermandosi alla prima iterazione. Se il codice viene eseguito senza errori, si ha il primo batch di addestramento che può essere ispezionato; se il codice dà errore, si sa con certezza che il problema è nel dataloader, come in questo caso:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->~/git/transformers/src/transformers/data/data_collator.py <span class="hljs-keyword">in</span> torch_default_data_collator(features) | |
| <span class="hljs-number">105</span> batch[k] = torch.stack([f[k] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features]) | |
| <span class="hljs-number">106</span> <span class="hljs-keyword">else</span>: | |
| --> <span class="hljs-number">107</span> batch[k] = torch.tensor([f[k] <span class="hljs-keyword">for</span> f <span class="hljs-keyword">in</span> features]) | |
| <span class="hljs-number">108</span> | |
| <span class="hljs-number">109</span> <span class="hljs-keyword">return</span> batch | |
| ValueError: expected sequence of length <span class="hljs-number">45</span> at dim <span class="hljs-number">1</span> (got <span class="hljs-number">76</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wuui5u">L’ispezione dell’ultimo frame del traceback dovrebbe essere sufficiente a fornire un indizio, ma cerchiamo di scavare un po’ più a fondo. La maggior parte dei problemi durante la creazione dei batch si verifica a causa del raggruppamento degli esempi in un singolo batch, quindi la prima cosa da controllare in caso di dubbio è quale <code>collate_fn</code> il tuo <code>DataLoader</code> sta usando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_collator = trainer.get_train_dataloader().collate_fn | |
| data_collator<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><function transformers.data.data_collator.default_data_collator(features: <span class="hljs-type">List</span>[InputDataClass], return_tensors=<span class="hljs-string">'pt'</span>) -> <span class="hljs-type">Dict</span>[<span class="hljs-built_in">str</span>, <span class="hljs-type">Any</span>]><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dg70m6">È il <code>default_data_collator</code>, ma non è quello che vogliamo in questo caso. Vogliamo che i nostri esempi siano espansi fino ad essere come la frase più lunga del batch, cosa che viene fatta dal collettore <code>DataCollatorWithPadding</code>. Questo collatore di dati dovrebbe essere usato di default da <code>Trainer</code>, quindi perché non viene usato qui?</p> <p data-svelte-h="svelte-1azvl87">La risposta è che non abbiamo passato il <code>tokenizer</code> al <code>Trainer</code>, quindi non ha potuto creare il <code>DataCollatorWithPadding</code> che volevamo. In pratica, non si dovrebbe mai esitare a passare esplicitamente il collettore di dati che si vuole usare, per essere sicuri di evitare questo tipo di errori. Adattiamo il nostro codice per fare esattamente questo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| DataCollatorWithPadding, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint) | |
| args = TrainingArguments( | |
| <span class="hljs-string">f"distilbert-finetuned-mnli"</span>, | |
| evaluation_strategy=<span class="hljs-string">"epoch"</span>, | |
| save_strategy=<span class="hljs-string">"epoch"</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| weight_decay=<span class="hljs-number">0.01</span>, | |
| ) | |
| metric = load_metric(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| data_collator = DataCollatorWithPadding(tokenizer=tokenizer) | |
| trainer = Trainer( | |
| model, | |
| args, | |
| train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>], | |
| eval_dataset=tokenized_datasets[<span class="hljs-string">"validation_matched"</span>], | |
| compute_metrics=compute_metrics, | |
| data_collator=data_collator, | |
| tokenizer=tokenizer, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-o804dd">La buona notizia? Non riceviamo più lo stesso errore di prima, il che è sicuramente un miglioramento. La cattiva notizia? Otteniamo invece un famigerato errore CUDA:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11eb86j">Questo è un male perché gli errori di CUDA sono estremamente difficili da debuggare in generale. Vedremo tra poco come risolvere questo problema, ma prima terminiamo l’analisi della creazione di batch.</p> <p data-svelte-h="svelte-c3yr03">Se siete sicuri che il tuo collettore di dati è quello giusto, dovresti provare ad applicarlo su un paio di campioni del tuo set di dati:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_collator = trainer.get_train_dataloader().collate_fn | |
| batch = data_collator([trainer.train_dataset[i] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">4</span>)])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10f6kik">Questo codice fallirà perché il <code>train_dataset</code> contiene colonne di tipo stringa, che il <code>Trainer</code> solitamente rimuove. È possibile rimuoverle manualmente o, se si vuole replicare esattamente ciò che il <code>Trainer</code> fa dietro le quinte, si può chiamare il metodo privato <code>Trainer._remove_unused_columns()</code> che fa questo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_collator = trainer.get_train_dataloader().collate_fn | |
| actual_train_set = trainer._remove_unused_columns(trainer.train_dataset) | |
| batch = data_collator([actual_train_set[i] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">4</span>)])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qy836m">Se l’errore persiste, si potrebbe eseguire manualmente il debug di ciò che accade all’interno del collettore di dati.</p> <p data-svelte-h="svelte-1jxb9re">Ora che abbiamo eseguito il debug del processo di creazione del batch, è il momento di passarne uno attraverso il modello!</p> <h3 class="relative group"><a id="passaggio-attraverso-il-modello" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#passaggio-attraverso-il-modello"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Passaggio attraverso il modello</span></h3> <p data-svelte-h="svelte-1jo2f0k">Dovrebbe essere possibile ottenere un batch eseguendo il seguente comando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> trainer.get_train_dataloader(): | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nf53su">Se si esegue questo codice in un notebook, è possibile che si verifichi un errore CUDA simile a quello visto in precedenza, nel qual caso è necessario riavviare il notebook e rieseguire l’ultimo snippet senza la riga <code>trainer.train()</code>. Questa è la seconda cosa più fastidiosa degli errori CUDA: rompono irrimediabilmente il kernel. La cosa più fastidiosa è che sono difficili da debuggare.</p> <p data-svelte-h="svelte-1m58vxq">Perché? Questo ha a che fare con il modo in cui funzionano le GPU. Sono estremamente efficienti nell’eseguire molte operazioni in parallelo, ma l’inconveniente è che quando una di queste istruzioni produce un errore, non lo si sa immediatamente. È solo quando il programma chiama una sincronizzazione dei processi multipli sulla GPU che esso si accorge che qualcosa è andato storto, quindi l’errore viene effettivamente sollevato in un punto che non ha niente a che fare con ciò che lo ha creato. Per esempio, se guardiamo il nostro traceback precedente, l’errore è stato sollevato durante il backward pass (<em>percorso discendente</em>), ma vedremo tra un minuto che in realtà deriva da qualcosa nel forward pass (<em>percorso ascendente</em>).</p> <p data-svelte-h="svelte-1l13w1c">Come si fa a fare il debug di questi errori? La risposta è semplice: non lo facciamo. A meno che l’errore CUDA non sia un errore out-of-memory (il che significa che la memoria della GPU non è sufficiente), si dovrebbe sempre tornare alla CPU per eseguire il debug.</p> <p data-svelte-h="svelte-1o77zbp">Per fare questo nel nostro caso, dobbiamo semplicemente rimettere il modello sulla CPU e chiamarlo sul nostro batch — il batch restituito dal <code>DataLoader</code> non è ancora stato spostato sulla GPU:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->outputs = trainer.model.cpu()(**batch)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->~/.pyenv/versions/<span class="hljs-number">3.7</span><span class="hljs-number">.9</span>/envs/base/lib/python3<span class="hljs-number">.7</span>/site-packages/torch/nn/functional.py <span class="hljs-keyword">in</span> nll_loss(<span class="hljs-built_in">input</span>, target, weight, size_average, ignore_index, reduce, reduction) | |
| <span class="hljs-number">2386</span> ) | |
| <span class="hljs-number">2387</span> <span class="hljs-keyword">if</span> dim == <span class="hljs-number">2</span>: | |
| -> <span class="hljs-number">2388</span> ret = torch._C._nn.nll_loss(<span class="hljs-built_in">input</span>, target, weight, _Reduction.get_enum(reduction), ignore_index) | |
| <span class="hljs-number">2389</span> <span class="hljs-keyword">elif</span> dim == <span class="hljs-number">4</span>: | |
| <span class="hljs-number">2390</span> ret = torch._C._nn.nll_loss2d(<span class="hljs-built_in">input</span>, target, weight, _Reduction.get_enum(reduction), ignore_index) | |
| IndexError: Target <span class="hljs-number">2</span> <span class="hljs-keyword">is</span> out of bounds.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7af71d">Quindi, il quadro si fa più chiaro. Invece di avere un errore CUDA, ora abbiamo un <code>IndexError</code> nel calcolo della loss (<em>funzione di perdita</em>) (quindi niente a che fare con il backward pass, come abbiamo detto prima). Più precisamente, possiamo vedere che è il target 2 a creare l’errore, quindi questo è un ottimo momento per controllare il numero di label del nostro modello:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.model.config.num_labels<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">2</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r6vhnq">Con due label, solo gli 0 e gli 1 sono ammessi come target, ma secondo il messaggio di errore abbiamo ottenuto un 2. Ottenere un 2 è in realtà normale: se ricordiamo i nomi delle etichette che abbiamo estratto in precedenza, ce n’erano tre, quindi abbiamo gli indici 0, 1 e 2 nel nostro dataset. Il problema è che non l’abbiamo detto al nostro modello, il quale si sarebbe dovuto creare con tre label. Quindi, risolviamo il problema!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| DataCollatorWithPadding, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=<span class="hljs-number">3</span>) | |
| args = TrainingArguments( | |
| <span class="hljs-string">f"distilbert-finetuned-mnli"</span>, | |
| evaluation_strategy=<span class="hljs-string">"epoch"</span>, | |
| save_strategy=<span class="hljs-string">"epoch"</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| weight_decay=<span class="hljs-number">0.01</span>, | |
| ) | |
| metric = load_metric(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| data_collator = DataCollatorWithPadding(tokenizer=tokenizer) | |
| trainer = Trainer( | |
| model, | |
| args, | |
| train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>], | |
| eval_dataset=tokenized_datasets[<span class="hljs-string">"validation_matched"</span>], | |
| compute_metrics=compute_metrics, | |
| data_collator=data_collator, | |
| tokenizer=tokenizer, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-utmeky">Non abbiamo ancora incluso la riga <code>trainer.train()</code>, per prendere tempo e verificare che tutto sia a posto. Se richiediamo un batch e lo passiamo al nostro modello, ora funziona senza errori!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> trainer.get_train_dataloader(): | |
| <span class="hljs-keyword">break</span> | |
| outputs = trainer.model.cpu()(**batch)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dq73ex">Il passo successivo consiste nel tornare a usare la GPU e verificare che tutto funzioni ancora:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| device = torch.device(<span class="hljs-string">"cuda"</span>) <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> torch.device(<span class="hljs-string">"cpu"</span>) | |
| batch = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> batch.items()} | |
| outputs = trainer.model.to(device)(**batch)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gwlw2w">Se si verifica ancora un errore, assicurarsi di riavviare il notebook ed eseguire solo l’ultima versione dello script.</p> <h3 class="relative group"><a id="esecuzione-di-un-passaggio-di-ottimizzazione" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#esecuzione-di-un-passaggio-di-ottimizzazione"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Esecuzione di un passaggio di ottimizzazione</span></h3> <p data-svelte-h="svelte-16nyvye">Ora che sappiamo che possiamo costruire batch che passano effettivamente attraverso il modello, siamo pronti per la fase successiva della pipeline di addestramento: calcolare i gradienti ed eseguire una fase di ottimizzazione.</p> <p data-svelte-h="svelte-12vqrqe">La prima parte consiste nel richiamare il metodo <code>backward()</code> sulla loss:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->loss = outputs.loss | |
| loss.backward()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-yiimkc">È abbastanza raro che si verifichi un errore in questa fase, ma se si verifica, assicurati di tornare ad usare la CPU per ottenere un messaggio di errore più utile.</p> <p data-svelte-h="svelte-xg78hk">Per eseguire la fase di ottimizzazione, è sufficiente creare l’oggetto <code>optimizer</code> e richiamare il suo metodo <code>step()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.create_optimizer() | |
| trainer.optimizer.step()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7wvbov">Anche in questo caso, se si utilizza l’ottimizzatore predefinito nel <code>Trainer</code>, non si dovrebbe ottenere un errore in questa fase, ma se hai un ottimizzatore personalizzato, potrebbero esserci dei problemi da risolvere. Non dimenticare di tornare alla CPU se ottieni uno strano errore CUDA in questa fase. A proposito di errori CUDA, prima abbiamo menzionato un caso speciale. Vediamo ora questo caso.</p> <h3 class="relative group"><a id="come-gestire-gli-errori-out-of-memory-di-cuda" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#come-gestire-gli-errori-out-of-memory-di-cuda"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Come gestire gli errori out-of-memory di CUDA</span></h3> <p data-svelte-h="svelte-1xjgmfh">Ogni volta che si riceve un messaggio di errore che inizia con <code>RuntimeError: CUDA out of memory</code>, indica che la memoria della GPU è esaurita. Questo errore non è direttamente collegato al codice e può verificarsi anche con uno script che funziona perfettamente. Questo errore significa che si è tentato di mettere troppe cose nella memoria interna della GPU e che si è verificato un errore. Come per altri errori di CUDA, è necessario riavviare il kernel per poter eseguire nuovamente l’allenamento.</p> <p data-svelte-h="svelte-z9hjuk">Per risolvere questo problema, è sufficiente utilizzare meno spazio sulla GPU, cosa che spesso è più facile a dirsi che a farsi. Per prima cosa, assicuratevi di non avere due modelli sulla GPU contemporaneamente (a meno che non sia necessario per il vostro problema, ovviamente). Poi, è probabile che si debba ridurre la dimensione del batch, in quanto influisce direttamente sulle dimensioni di tutti gli output intermedi del modello e dei loro gradienti. Se il problema persiste, si può considerare di utilizzare una versione più piccola del modello.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-q2er6j">Nella prossima parte del corso, esamineremo tecniche più avanzate che possono aiutare a ridurre l’impatto sulla memoria e ad affinare i modelli più grandi.</p></div> <h3 class="relative group"><a id="valutazione-del-modello" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#valutazione-del-modello"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Valutazione del modello</span></h3> <p data-svelte-h="svelte-7dspo7">Ora che abbiamo risolto tutti i problemi con il nostro codice, tutto è perfetto e l’addestramento dovrebbe girare senza intoppi, giusto? Non così veloce! Se si esegue il comando <code>trainer.train()</code>, all’inizio sembrerà tutto a posto, ma dopo un po’ si otterrà il seguente risultato:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># This will take a long time and error out, so you shouldn't run this cell</span> | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TypeError: only size-<span class="hljs-number">1</span> arrays can be converted to Python scalars<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-103takw">Ti accorgerai che questo errore compare durante la fase di valutazione, quindi è l’ultima cosa che dobbiamo debuggare.</p> <p data-svelte-h="svelte-cpi9zs">È possibile eseguire il ciclo di valutazione del <code>Trainer</code> indipendentemente dall’addestramento, in questo modo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer.evaluate()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TypeError: only size-<span class="hljs-number">1</span> arrays can be converted to Python scalars<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1wy054r">💡 Bisogna sempre assicurarsi di poter eseguire <code>trainer.evaluate()</code> prima di lanciare <code>trainer.train()</code>, per evitare di sprecare molte risorse di calcolo prima di incorrere in un errore.</p></div> <p data-svelte-h="svelte-18r78mm">Prima di tentare il debug di un problema nel ciclo di valutazione, è necessario assicurarsi di aver dato un’occhiata ai dati, di essere in grado di generare correttamente un batch e di poter eseguire il modello su di esso. Abbiamo completato tutti questi passaggi, quindi il codice seguente può essere eseguito senza errori:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> trainer.get_eval_dataloader(): | |
| <span class="hljs-keyword">break</span> | |
| batch = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> batch.items()} | |
| <span class="hljs-keyword">with</span> torch.no_grad(): | |
| outputs = trainer.model(**batch)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12a6eh4">L’errore arriva più tardi, alla fine della fase di valutazione, e se guardiamo il traceback vediamo questo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->~/git/datasets/src/datasets/metric.py <span class="hljs-keyword">in</span> add_batch(self, predictions, references) | |
| <span class="hljs-number">431</span> <span class="hljs-string">""" | |
| 432 batch = {"predictions": predictions, "references": references} | |
| --> 433 batch = self.info.features.encode_batch(batch) | |
| 434 if self.writer is None: | |
| 435 self._init_writer()</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fz7k57">Questo ci dice che l’errore ha origine nel modulo <code>datasets/metric.py</code>, quindi si tratta di un problema con la nostra funzione <code>compute_metrics()</code>. La funzione accetta una tupla con i logit e le label come array NumPy, quindi proviamo a dargliela in pasto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->predictions = outputs.logits.cpu().numpy() | |
| labels = batch[<span class="hljs-string">"labels"</span>].cpu().numpy() | |
| compute_metrics((predictions, labels))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->TypeError: only size-<span class="hljs-number">1</span> arrays can be converted to Python scalars<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-noq9wi">Otteniamo lo stesso errore, quindi il problema risiede sicuramente in quella funzione. Se guardiamo al suo codice, vediamo che sta solo trasferendo le <code>predictions</code> e le <code>labels</code> a <code>metric.compute()</code>. C’è quindi un problema con questo metodo? Non proprio. Diamo una rapida occhiata alle dimensioni:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->predictions.shape, labels.shape<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->((<span class="hljs-number">8</span>, <span class="hljs-number">3</span>), (<span class="hljs-number">8</span>,))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lqh3e1">Le nostre previsioni sono ancora dei logit, non le vere previsioni, ed è per questo che la metrica restituisce questo errore (un po’ oscuro). La soluzione è abbastanza semplice: basta aggiungere un argmax nella funzione <code>compute_metrics()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| predictions = np.argmax(predictions, axis=<span class="hljs-number">1</span>) | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| compute_metrics((predictions, labels))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.625</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lt88y">Ora il nostro errore è stato risolto! Questo era l’ultimo, quindi il nostro script ora addestrerà correttamente un modello.</p> <p data-svelte-h="svelte-1rw0eah">Per riferimento, ecco lo script completamente corretto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset, load_metric | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> ( | |
| AutoTokenizer, | |
| AutoModelForSequenceClassification, | |
| DataCollatorWithPadding, | |
| TrainingArguments, | |
| Trainer, | |
| ) | |
| raw_datasets = load_dataset(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| model_checkpoint = <span class="hljs-string">"distilbert-base-uncased"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">"premise"</span>], examples[<span class="hljs-string">"hypothesis"</span>], truncation=<span class="hljs-literal">True</span>) | |
| tokenized_datasets = raw_datasets.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>) | |
| model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=<span class="hljs-number">3</span>) | |
| args = TrainingArguments( | |
| <span class="hljs-string">f"distilbert-finetuned-mnli"</span>, | |
| evaluation_strategy=<span class="hljs-string">"epoch"</span>, | |
| save_strategy=<span class="hljs-string">"epoch"</span>, | |
| learning_rate=<span class="hljs-number">2e-5</span>, | |
| num_train_epochs=<span class="hljs-number">3</span>, | |
| weight_decay=<span class="hljs-number">0.01</span>, | |
| ) | |
| metric = load_metric(<span class="hljs-string">"glue"</span>, <span class="hljs-string">"mnli"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_metrics</span>(<span class="hljs-params">eval_pred</span>): | |
| predictions, labels = eval_pred | |
| predictions = np.argmax(predictions, axis=<span class="hljs-number">1</span>) | |
| <span class="hljs-keyword">return</span> metric.compute(predictions=predictions, references=labels) | |
| data_collator = DataCollatorWithPadding(tokenizer=tokenizer) | |
| trainer = Trainer( | |
| model, | |
| args, | |
| train_dataset=tokenized_datasets[<span class="hljs-string">"train"</span>], | |
| eval_dataset=tokenized_datasets[<span class="hljs-string">"validation_matched"</span>], | |
| compute_metrics=compute_metrics, | |
| data_collator=data_collator, | |
| tokenizer=tokenizer, | |
| ) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yq201f">In questo caso, non ci sono più problemi e il nostro script affinerà un modello che dovrebbe dare risultati ragionevoli. Ma cosa possiamo fare quando l’addestramento procede senza errori e il modello addestrato non funziona affatto bene? Questa è la parte più difficile di machine learning e ti mostreremo alcune tecniche che possono aiutarti.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-17w7sr0">💡 Se si utilizza un ciclo di addestramento manuale, per il debug della pipeline di addestramento valgono gli stessi passaggi, ma è più facile separarli. Assicurati però di non aver dimenticato il <code>model.eval()</code> o il <code>model.train()</code> nei punti giusti, o lo <code>zero_grad()</code> a ogni passo!</p></div> <h2 class="relative group"><a id="debug-degli-errori-silenziosi-durante-laddestramento" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#debug-degli-errori-silenziosi-durante-laddestramento"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Debug degli errori silenziosi durante l’addestramento</span></h2> <p data-svelte-h="svelte-r6p8wy">Cosa possiamo fare per eseguire il debug di un addestramento che viene completato senza errori, ma che non produce buoni risultati? Qui ti daremo alcuni suggerimenti, ma sappi che questo tipo di debugging è la parte più difficile di machine learning e non esiste una soluzione magica.</p> <h3 class="relative group"><a id="controllare-i-dati-di-nuovo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controllare-i-dati-di-nuovo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Controllare i dati (di nuovo!)</span></h3> <p data-svelte-h="svelte-juxokd">Il tuo modello imparerà qualcosa solo se è effettivamente possibile imparare qualcosa dai tuoi dati. Se c’è un bug che corrompe i dati o le label sono assegnate in modo casuale, è molto probabile che non si riesca ad addestrare il modello sul dataset. Quindi, inizia sempre con un doppio controllo degli input e delle label decodificate e poniti le seguenti domande:</p> <ul data-svelte-h="svelte-1ygrnog"><li>I dati decodificati sono comprensibili?</li> <li>Sei d’accordo con le label?</li> <li>C’è una label più comune delle altre?</li> <li>Quale dovrebbe essere la funzione di perdita/metrica se il modello predicesse una risposta a caso/sempre la stessa risposta?</li></ul> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-yqzwsk">⚠️ Se effettui un addestramento in modo distribuito, stampa campioni del set di dati in ogni processo e controlla molto attentamente che ottieni la stessa cosa. Un bug comune è la presenza di una qualche fonte di casualità nella creazione dei dati che fa sì che ogni processo abbia una versione diversa del set di dati.</p></div> <p data-svelte-h="svelte-n5w149">Dopo aver esaminato i dati, esamina alcune previsioni del modello e decodificale. Se il modello prevede sempre la stessa cosa, potrebbe essere perché il tuo set di dati è influenzato verso una categoria (per i problemi di classificazione); tecniche come fare oversampling (<em>sovra-campionamento</em>) delle classi rare potrebbero aiutare.</p> <p data-svelte-h="svelte-1dmtka">Se la funzione di perdita/metrica ottenuta con il tuo modello iniziale è molto diversa da quella che ci si aspetterebbe per le previsioni casuali, ricontrolla il modo in cui viene calcolata la funzione o la metrica, perché probabilmente c’è un bug. Se si utilizzano diverse funzioni che aggiungi alla fine, assicurati che siano della stessa grandezza.</p> <p data-svelte-h="svelte-wddm9m">Quando sei sicuro/a che i dati sono perfetti, puoi verificare se il modello è in grado di addestrarsi su di essi con un semplice test.</p> <h3 class="relative group"><a id="fare-overfitting-del-modello-su-un-batch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fare-overfitting-del-modello-su-un-batch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fare overfitting del modello su un batch</span></h3> <p data-svelte-h="svelte-x29dj2">L’overfitting è di solito qualcosa che cerchiamo di evitare durante l’addestramento, poiché significa che il modello non sta imparando a riconoscere le proprietà generali che vogliamo, ma sta invece memorizzando i campioni di addestramento. Tuttavia, provare ad addestrare il modello su un batch più e più volte è un buon test per verificare se il problema così come è stato inquadrato può essere risolto dal modello che si sta cercando di addestrare. Inoltre, ti aiuterà a capire se il learning rate (<em>tasso di apprendimento</em>) iniziale è troppo alta.</p> <p data-svelte-h="svelte-d51vi8">Una volta definito il <code>Trainer</code>, è molto semplice: basta prendere un batch dal training set, ed eseguire un piccolo ciclo di addestramento manuale utilizzando solo quel batch per qualcosa come 20 step:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> trainer.get_train_dataloader(): | |
| <span class="hljs-keyword">break</span> | |
| batch = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> batch.items()} | |
| trainer.create_optimizer() | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">20</span>): | |
| outputs = trainer.model(**batch) | |
| loss = outputs.loss | |
| loss.backward() | |
| trainer.optimizer.step() | |
| trainer.optimizer.zero_grad()<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1ueptnc">💡 Se i dati di addestramento sono sbilanciati, assicurati di creare un batch di dati di addestramento contenente tutte le label.</p></div> <p data-svelte-h="svelte-13qqe72">Il modello risultante dovrebbe avere risultati quasi perfetti sullo stesso <code>batch</code>. Calcoliamo la metrica sulle previsioni risultanti:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">with</span> torch.no_grad(): | |
| outputs = trainer.model(**batch) | |
| preds = outputs.logits | |
| labels = batch[<span class="hljs-string">"labels"</span>] | |
| compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">1.0</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ugnqo2">100% di accuratezza, questo è un bell’esempio di overfitting (il che significa che se provi il tuo modello su qualsiasi altra frase, molto probabilmente ti darà una risposta sbagliata)!</p> <p data-svelte-h="svelte-1luvbcs">Se non si riesci a far sì che il modello ottenga risultati perfetti come questo, significa che c’è qualcosa di sbagliato nel modo in cui si è impostato il problema o con i dati, e quindi dovresti risolvere questa cosa. Solo quando riesci a superare il test di overfitting puoi essere sicuro/a che il tuo modello possa effettivamente imparare qualcosa.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-wtiz5s">⚠️ Sarà necessario ricreare il modello e il <code>Trainer</code> dopo questo test, poiché il modello ottenuto probabilmente non sarà in grado di recuperare e imparare qualcosa di utile sul set di dati completo.</p></div> <h3 class="relative group"><a id="non-calibrare-niente-prima-di-avere-una-prima-baseline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-calibrare-niente-prima-di-avere-una-prima-baseline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Non calibrare niente prima di avere una prima baseline</span></h3> <p data-svelte-h="svelte-gth42h">Hyperparameter tuning (<em>calibrazione degli iperparametri</em>) è sempre considerato come la parte più difficile di machine learning, ma è solo l’ultimo passo per aiutarti a migliorare un po’ la metrica. Nella maggior parte dei casi, gli iperparametri predefiniti del <code>Trainer</code> funzionano bene per dare buoni risultati, quindi non ci si deve lanciare in una ricerca di iperparametri dispendiosa in termini di tempo e di costi, finché non si è ottenuto qualcosa che batta la baseline (<em>base di partenza</em>) che si ha sul dataset.</p> <p data-svelte-h="svelte-1ytlvoo">Una volta ottenuto un modello sufficientemente buono, si può iniziare a modificarlo un po’. Non provare a eseguire l’addestramento un migliaio di volte con iperparametri diversi, ma confronta un paio di esecuzioni che hanno valori diversi per un iperparametro così da avere un’idea di quale abbia il maggiore impatto.</p> <p data-svelte-h="svelte-1q3oxbc">Se stai modificando il modello stesso, mantieni le cose semplici e non provare nulla che non possa essere ragionevolmente giustificato. Assicurati sempre di rifare il test di overfitting per verificare che la modifica non abbia avuto conseguenze indesiderate.</p> <h3 class="relative group"><a id="chiedere-aiuto" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chiedere-aiuto"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chiedere aiuto</span></h3> <p data-svelte-h="svelte-1824t5s">Speriamo che in questa sezione tu abbia trovato qualche consiglio utile a risolvere il tuo problema, ma se così non fosse, ricordati che puoi sempre chiedere aiuto alla community nei <a href="https://discuss.huggingface.co/" rel="nofollow">forum</a>.</p> <p data-svelte-h="svelte-1guizfr">Qui di seguito sono riportate alcune risorse aggiuntive che potrebbero rivelarsi utili:</p> <ul data-svelte-h="svelte-rloazi"><li><a href="https://docs.google.com/presentation/d/1yHLPvPhUs2KGI5ZWo0sU-PKU3GimAk3iTsI38Z-B5Gw/edit#slide=id.p" rel="nofollow">“Reproducibility as a vehicle for engineering best practices”</a> di Joel Grus</li> <li><a href="https://towardsdatascience.com/checklist-for-debugging-neural-networks-d8b2a9434f21" rel="nofollow">“Checklist for debugging neural networks”</a> di Cecelia Shao</li> <li><a href="https://medium.com/@keeper6928/how-to-unit-test-machine-learning-code-57cf6fd81765" rel="nofollow">“How to unit test machine learning code”</a> di Chase Roberts</li> <li><a href="http://karpathy.github.io/2019/04/25/recipe/" rel="nofollow">“A Recipe for Training Neural Networks”</a> di Andrej Karpathy</li></ul> <p data-svelte-h="svelte-14oh3jw">Naturalmente, non tutti i problemi che incontrerai durante l’addestramento delle reti neurali sono colpa tua! Se si incontra qualcosa nella libreria 🤗 Transformers o 🤗 Datasets che non sembra corretto, è possibile che si sia trovato un bug. Dovresti assolutamente segnalarcelo e nella prossima sezione ti spiegheremo esattamente come fare.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/it/chapter8/4.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_u8ez91 = { | |
| assets: "/docs/course/pr_1069/it", | |
| base: "/docs/course/pr_1069/it", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1069/it/_app/immutable/entry/start.693d748d.js"), | |
| import("/docs/course/pr_1069/it/_app/immutable/entry/app.e9cfd099.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 44], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 148 kB
- Xet hash:
- 731071408002dda218fb228fbf259c6b783fe3eb6fb9b5b69508d0c36ddfc3d1
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.