Buckets:

rtrm's picture
download
raw
73.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Big data? 🤗 Datasets ao resgate&quot;,&quot;local&quot;:&quot;big-data--datasets-ao-resgate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;O que é the Pile?&quot;,&quot;local&quot;:&quot;o-que-é-the-pile&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A magia do mapeamento de memória&quot;,&quot;local&quot;:&quot;a-magia-do-mapeamento-de-memória&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conjuntos de dados em streaming&quot;,&quot;local&quot;:&quot;conjuntos-de-dados-em-streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/pt/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/entry/start.854f6ddb.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/singletons.5a4db441.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/paths.052a2e73.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/entry/app.99d5705b.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/nodes/0.24f28b67.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/nodes/31.d65aef3d.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Big data? 🤗 Datasets ao resgate&quot;,&quot;local&quot;:&quot;big-data--datasets-ao-resgate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;O que é the Pile?&quot;,&quot;local&quot;:&quot;o-que-é-the-pile&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;A magia do mapeamento de memória&quot;,&quot;local&quot;:&quot;a-magia-do-mapeamento-de-memória&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Conjuntos de dados em streaming&quot;,&quot;local&quot;:&quot;conjuntos-de-dados-em-streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="big-data--datasets-ao-resgate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#big-data--datasets-ao-resgate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Big data? 🤗 Datasets ao resgate</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/pt/chapter5/section4.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/pt/chapter5/section4.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1q486zx">Hoje em dia, não é incomum encontrar-se trabalhando com conjuntos de dados de vários gigabytes, especialmente se você planeja pré-treinar um transformer como BERT ou GPT-2 do zero. Nesses casos, até mesmo <em>carregar</em> os dados pode ser um desafio. Por exemplo, o corpus WebText usado para pré-treinar o GPT-2 consiste em mais de 8 milhões de documentos e 40 GB de texto - carregar isso na RAM do seu laptop provavelmente lhe causará um ataque cardíaco!</p> <p data-svelte-h="svelte-9tb4dk">Felizmente, 🤗 Datasets foram projetados para superar essas limitações. Ele libera você de problemas de gerenciamento de memória tratando conjuntos de dados como arquivos <em>memory-mapped</em> e de limites de disco rígido por <em>streaming</em> das entradas em um corpus.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/JwISwTCPPWo" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1uw732t">Nesta seção, exploraremos esses recursos de 🤗 Conjuntos de dados com um enorme corpus de 825 GB conhecido como <a href="https://pile.eleuther.ai" rel="nofollow">the Pile</a>. Vamos começar!</p> <h2 class="relative group"><a id="o-que-é-the-pile" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#o-que-é-the-pile"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>O que é the Pile?</span></h2> <p data-svelte-h="svelte-1qjf1np">O <code>The Pile</code> é um corpus de texto em inglês que foi criado por <a href="https://www.eleuther.ai" rel="nofollow">EleutherAI</a> para treinar modelos de linguagem em larga escala. Ele inclui uma gama diversificada de conjuntos de dados, abrangendo artigos científicos, repositórios de código do GitHub e texto da web filtrado. O corpus de treinamento está disponível em <a href="https://the-eye.eu/public/AI/pile/" rel="nofollow">blocos de 14 GB</a>, e você também pode baixar vários dos <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">componentes individuais</a>. Vamos começar dando uma olhada no conjunto de dados PubMed Abstracts, que é um corpus de resumos de 15 milhões de publicações biomédicas no <a href="https://pubmed.ncbi.nlm.nih.gov/" rel="nofollow">PubMed</a>. O conjunto de dados está em <a href="https://jsonlines.org" rel="nofollow">formato JSON Lines</a> e é compactado usando a biblioteca <code>zstandard</code>, então primeiro precisamos instalá-lo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install zstandard<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-afsc53">Em seguida, podemos carregar o conjunto de dados usando o método para arquivos remotos que aprendemos na <a href="/course/chapter5/2">seção 2</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># This takes a few minutes to run, so go grab a tea or coffee while you wait :)</span>
data_files = <span class="hljs-string">&quot;https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst&quot;</span>
pubmed_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>)
pubmed_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;meta&#x27;</span>, <span class="hljs-string">&#x27;text&#x27;</span>],
num_rows: <span class="hljs-number">15518009</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iemqix">Podemos ver que há 15.518.009 linhas e 2 colunas em nosso conjunto de dados - isso é muito!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-a6xli5">✎ Por padrão, 🤗 Datasets descompactará os arquivos necessários para carregar um dataset. Se você quiser preservar espaço no disco rígido, você pode passar <code>DownloadConfig(delete_extracted=True)</code> para o argumento <code>download_config</code> de <code>load_dataset()</code>. Consulte a <a href="https://huggingface.co/docs/datasets/package_reference/builder_classes#datasets.DownloadConfig" rel="nofollow">documentação</a> para obter mais detalhes.</p></div> <p data-svelte-h="svelte-1xjyfts">Vamos inspecionar o conteúdo do primeiro exemplo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pubmed_dataset[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bdr1nw">Ok, isso parece o resumo de um artigo médico. Agora vamos ver quanta RAM usamos para carregar o conjunto de dados!</p> <h2 class="relative group"><a id="a-magia-do-mapeamento-de-memória" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-magia-do-mapeamento-de-memória"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A magia do mapeamento de memória</span></h2> <p data-svelte-h="svelte-1m7fz3m">Uma maneira simples de medir o uso de memória em Python é com a biblioteca <a href="https://psutil.readthedocs.io/en/latest/" rel="nofollow"><code>psutil</code></a>, que pode ser instalada com <code>pip</code> da seguinte forma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install psutil<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3lzw0l">Ele fornece uma classe <code>Process</code> que nos permite verificar o uso de memória do processo atual da seguinte forma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> psutil
<span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;RAM used: <span class="hljs-subst">{psutil.Process().memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>):<span class="hljs-number">.2</span>f}</span> MB&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->RAM used: <span class="hljs-number">5678.33</span> MB<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a7sky6">Aqui o atributo <code>rss</code> refere-se ao <em>tamanho do conjunto residente</em>, que é a fração de memória que um processo ocupa na RAM. Essa medida também inclui a memória usada pelo interpretador Python e as bibliotecas que carregamos, portanto, a quantidade real de memória usada para carregar o conjunto de dados é um pouco menor. Para comparação, vamos ver o tamanho do conjunto de dados no disco, usando o atributo <code>dataset_size</code>. Como o resultado é expresso em bytes como antes, precisamos convertê-lo manualmente para gigabytes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Number of files in dataset : <span class="hljs-subst">{pubmed_dataset.dataset_size}</span>&quot;</span>)
size_gb = pubmed_dataset.dataset_size / (<span class="hljs-number">1024</span>**<span class="hljs-number">3</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Dataset size (cache file) : <span class="hljs-subst">{size_gb:<span class="hljs-number">.2</span>f}</span> GB&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Number of files <span class="hljs-keyword">in</span> dataset : <span class="hljs-number">20979437051</span>
Dataset size (cache file) : <span class="hljs-number">19.54</span> GB<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-c8hqos">Legal — apesar de ter quase 20 GB de tamanho, podemos carregar e acessar o conjunto de dados com muito menos RAM!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1lvcqa6">✏️ <strong>Experimente!</strong> Escolha um dos <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">subconjuntos</a> da <code>The Pile</code> que é maior que a RAM do seu laptop ou desktop, carregue com 🤗 Datasets e meça a quantidade de RAM usada. Observe que, para obter uma medição precisa, você desejará fazer isso em um novo processo. Você pode encontrar os tamanhos descompactados de cada subconjunto na Tabela 1 do <a href="https://arxiv.org/abs/2101.00027" rel="nofollow">artigo do <code>The Pile</code></a>.</p></div> <p data-svelte-h="svelte-lemoq5">Se você estiver familiarizado com Pandas, esse resultado pode ser uma surpresa por causa da famosa [regra de ouro] de Wes Kinney (<a href="https://wesmckinney.com/blog/apache-arrow-pandas-internals/" rel="nofollow">https://wesmckinney.com/blog/apache-arrow-pandas-internals/</a>) de que você normalmente precisa de 5 para 10 vezes mais RAM do que o tamanho do seu conjunto de dados. Então, como 🤗 Datasets resolve esse problema de gerenciamento de memória? 🤗 Os conjuntos de dados tratam cada conjunto de dados como um <a href="https://en.wikipedia.org/wiki/Memory-mapped_file" rel="nofollow">arquivo mapeado em memória</a>, que fornece um mapeamento entre RAM e armazenamento do sistema de arquivos que permite que a biblioteca acesse e opere em elementos do conjunto de dados sem precisar carregá-lo totalmente na memória.</p> <p data-svelte-h="svelte-1x65nwt">Arquivos mapeados em memória também podem ser compartilhados em vários processos, o que permite que métodos como <code>Dataset.map()</code> sejam paralelizados sem a necessidade de mover ou copiar o conjunto de dados. Sob o capô, esses recursos são todos realizados pelo formato de memória <a href="https://arrow.apache.org" rel="nofollow">Apache Arrow</a> e <a href="https://arrow.apache.org/docs/python/index.html" rel="nofollow"><code>pyarrow</code></a>, que tornam o carregamento e o processamento de dados extremamente rápidos. (Para mais detalhes sobre o Apache Arrow e comparações com o Pandas, confira <a href="https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a" rel="nofollow">post do blog de Dejan Simic</a>.) Para ver isso em ação, vamos executar um pequeno teste de velocidade iterando sobre todos os elementos no conjunto de dados PubMed Abstracts:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> timeit
code_snippet = <span class="hljs-string">&quot;&quot;&quot;batch_size = 1000
for idx in range(0, len(pubmed_dataset), batch_size):
_ = pubmed_dataset[idx:idx + batch_size]
&quot;&quot;&quot;</span>
time = timeit.timeit(stmt=code_snippet, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>())
<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;Iterated over <span class="hljs-subst">{<span class="hljs-built_in">len</span>(pubmed_dataset)}</span> examples (about <span class="hljs-subst">{size_gb:<span class="hljs-number">.1</span>f}</span> GB) in &quot;</span>
<span class="hljs-string">f&quot;<span class="hljs-subst">{time:<span class="hljs-number">.1</span>f}</span>s, i.e. <span class="hljs-subst">{size_gb/time:<span class="hljs-number">.3</span>f}</span> GB/s&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&#x27;Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ffobuu">Aqui usamos o módulo <code>timeit</code> do Python para medir o tempo de execução do <code>code_snippet</code>. Normalmente, você poderá iterar em um conjunto de dados a uma velocidade de alguns décimos de GB/s a vários GB/s. Isso funciona muito bem para a grande maioria dos aplicativos, mas às vezes você terá que trabalhar com um conjunto de dados grande demais para ser armazenado no disco rígido do seu laptop. Por exemplo, se tentássemos baixar o Pile por completo, precisaríamos de 825 GB de espaço livre em disco! Para lidar com esses casos, 🤗 Datasets fornece um recurso de streaming que nos permite baixar e acessar elementos em tempo real, sem a necessidade de baixar todo o conjunto de dados. Vamos dar uma olhada em como isso funciona.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-trq3jj">💡 Nos notebooks Jupyter, você também pode cronometrar células usando a <a href="https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-timeit" rel="nofollow"><code>%%timeit</code> função mágica</a>.</p></div> <h2 class="relative group"><a id="conjuntos-de-dados-em-streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conjuntos-de-dados-em-streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conjuntos de dados em streaming</span></h2> <p data-svelte-h="svelte-teh51o">Para habilitar o streaming do conjunto de dados você só precisa passar o argumento <code>streaming=True</code> para a função <code>load_dataset()</code>. Por exemplo, vamos carregar o conjunto de dados PubMed Abstracts novamente, mas em modo streaming:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pubmed_dataset_streamed = load_dataset(
<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dh8umf">Em vez do familiar <code>Dataset</code> que encontramos em outro lugar neste capítulo, o objeto retornado com <code>streaming=True</code> é um <code>IterableDataset</code>. Como o nome sugere, para acessar os elementos de um <code>IterableDataset</code> precisamos iterar sobre ele. Podemos acessar o primeiro elemento do nosso conjunto de dados transmitido da seguinte forma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pubmed_dataset_streamed))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Epidemiology of hypoxaemia in children with acute lower respiratory infection.\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vpifmk">Os elementos de um conjunto de dados transmitido podem ser processados dinamicamente usando <code>IterableDataset.map()</code>, o que é útil durante o treinamento se você precisar tokenizar as entradas. O processo é exatamente o mesmo que usamos para tokenizar nosso conjunto de dados no <a href="/course/chapter3">Capítulo 3</a>, com a única diferença de que as saídas são retornadas uma a uma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;distilbert-base-uncased&quot;</span>)
tokenized_dataset = pubmed_dataset_streamed.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: tokenizer(x[<span class="hljs-string">&quot;text&quot;</span>]))
<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(tokenized_dataset))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;input_ids&#x27;</span>: [<span class="hljs-number">101</span>, <span class="hljs-number">4958</span>, <span class="hljs-number">5178</span>, <span class="hljs-number">4328</span>, <span class="hljs-number">6779</span>, ...], <span class="hljs-string">&#x27;attention_mask&#x27;</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ...]}<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1kciw7b">💡 Para acelerar a tokenização com streaming você pode passar <code>batched=True</code>, como vimos na última seção. Ele processará os exemplos lote por lote; o tamanho do lote padrão é 1.000 e pode ser especificado com o argumento <code>batch_size</code>.</p></div> <p data-svelte-h="svelte-16pdpnn">Você também pode embaralhar um conjunto de dados transmitido usando <code>IterableDataset.shuffle()</code>, mas, diferentemente de <code>Dataset.shuffle()</code>, isso apenas embaralha os elementos em um <code>buffer_size</code> predefinido:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=<span class="hljs-number">10_000</span>, seed=<span class="hljs-number">42</span>)
<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(shuffled_dataset))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11410799</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jif4zv">Neste exemplo, selecionamos um exemplo aleatório dos primeiros 10.000 exemplos no buffer. Uma vez que um exemplo é acessado, seu lugar no buffer é preenchido com o próximo exemplo no corpus (ou seja, o 10.001º exemplo no caso acima). Você também pode selecionar elementos de um conjunto de dados transmitido usando as funções <code>IterableDataset.take()</code> e <code>IterableDataset.skip()</code>, que agem de maneira semelhante a <code>Dataset.select()</code>. Por exemplo, para selecionar os primeiros 5 exemplos no conjunto de dados PubMed Abstracts, podemos fazer o seguinte:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->dataset_head = pubmed_dataset_streamed.take(<span class="hljs-number">5</span>)
<span class="hljs-built_in">list</span>(dataset_head)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Epidemiology of hypoxaemia in children with acute lower respiratory infection ...&#x27;</span>},
{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409575</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...&#x27;</span>},
{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409576</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&quot;Hypoxaemia in children with severe pneumonia in Papua New Guinea ...&quot;</span>},
{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409577</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Oxygen concentrators and cylinders ...&#x27;</span>},
{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409578</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Oxygen supply in rural africa: a personal experience ...&#x27;</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mv032g">Da mesma forma, você pode usar a função <code>IterableDataset.skip()</code> para criar divisões de treinamento e validação de um conjunto de dados embaralhado da seguinte forma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Skip the first 1,000 examples and include the rest in the training set</span>
train_dataset = shuffled_dataset.skip(<span class="hljs-number">1000</span>)
<span class="hljs-comment"># Take the first 1,000 examples for the validation set</span>
validation_dataset = shuffled_dataset.take(<span class="hljs-number">1000</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7b97jo">Vamos completar nossa exploração de streaming de conjuntos de dados com um aplicativo comum: combinar vários conjuntos de dados para criar um único corpus. 🤗 Datasets fornece uma função <code>interleave_datasets()</code> que converte uma lista de objetos <code>IterableDataset</code> em um único <code>IterableDataset</code>, onde os elementos do novo conjunto de dados são obtidos alternando entre os exemplos de origem. Essa função é especialmente útil quando você está tentando combinar grandes conjuntos de dados, então, como exemplo, vamos transmitir o subconjunto FreeLaw do Pile, que é um conjunto de dados de 51 GB de pareceres jurídicos dos tribunais dos EUA:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->law_dataset_streamed = load_dataset(
<span class="hljs-string">&quot;json&quot;</span>,
data_files=<span class="hljs-string">&quot;https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst&quot;</span>,
split=<span class="hljs-string">&quot;train&quot;</span>,
streaming=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(law_dataset_streamed))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;case_ID&#x27;</span>: <span class="hljs-string">&#x27;110921.json&#x27;</span>,
<span class="hljs-string">&#x27;case_jurisdiction&#x27;</span>: <span class="hljs-string">&#x27;scotus.tar.gz&#x27;</span>,
<span class="hljs-string">&#x27;date_created&#x27;</span>: <span class="hljs-string">&#x27;2010-04-28T17:12:49Z&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued January 19, 1983.\nDecided April 26, 1983.\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lgjetr">Esse conjunto de dados é grande o suficiente para sobrecarregar a RAM da maioria dos laptops, mas conseguimos carregá-lo e acessá-lo sem suar a camisa! Vamos agora combinar os exemplos dos conjuntos de dados FreeLaw e PubMed Abstracts com a função <code>interleave_datasets()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> itertools <span class="hljs-keyword">import</span> islice
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> interleave_datasets
combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed])
<span class="hljs-built_in">list</span>(islice(combined_dataset, <span class="hljs-number">2</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pmid&#x27;</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">&#x27;language&#x27;</span>: <span class="hljs-string">&#x27;eng&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;Epidemiology of hypoxaemia in children with acute lower respiratory infection ...&#x27;</span>},
{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;case_ID&#x27;</span>: <span class="hljs-string">&#x27;110921.json&#x27;</span>,
<span class="hljs-string">&#x27;case_jurisdiction&#x27;</span>: <span class="hljs-string">&#x27;scotus.tar.gz&#x27;</span>,
<span class="hljs-string">&#x27;date_created&#x27;</span>: <span class="hljs-string">&#x27;2010-04-28T17:12:49Z&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued January 19, 1983.\nDecided April 26, 1983.\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...&#x27;</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14cmuu4">Aqui usamos a função <code>islice()</code> do módulo <code>itertools</code> do Python para selecionar os dois primeiros exemplos do conjunto de dados combinado e podemos ver que eles correspondem aos primeiros exemplos de cada um dos dois conjuntos de dados de origem.</p> <p data-svelte-h="svelte-sgohav">Por fim, se você quiser transmitir o Pile em sua totalidade de 825 GB, poderá pegar todos os arquivos preparados da seguinte maneira:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->base_url = <span class="hljs-string">&quot;https://the-eye.eu/public/AI/pile/&quot;</span>
data_files = {
<span class="hljs-string">&quot;train&quot;</span>: [base_url + <span class="hljs-string">&quot;train/&quot;</span> + <span class="hljs-string">f&quot;<span class="hljs-subst">{idx:02d}</span>.jsonl.zst&quot;</span> <span class="hljs-keyword">for</span> idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">30</span>)],
<span class="hljs-string">&quot;validation&quot;</span>: base_url + <span class="hljs-string">&quot;val.jsonl.zst&quot;</span>,
<span class="hljs-string">&quot;test&quot;</span>: base_url + <span class="hljs-string">&quot;test.jsonl.zst&quot;</span>,
}
pile_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, streaming=<span class="hljs-literal">True</span>)
<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pile_dataset[<span class="hljs-string">&quot;train&quot;</span>]))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;meta&#x27;</span>: {<span class="hljs-string">&#x27;pile_set_name&#x27;</span>: <span class="hljs-string">&#x27;Pile-CC&#x27;</span>},
<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-6wh2xc">✏️ <strong>Experimente!</strong> Use um dos grandes corpora Common Crawl como <a href="https://huggingface.co/datasets/mc4" rel="nofollow"><code>mc4</code></a> ou <a href="https://huggingface.co/datasets/oscar" rel="nofollow"><code>oscar</code></a> para criar um conjunto de dados multilíngue de streaming que represente as proporções faladas de idiomas em um país de sua escolha. Por exemplo, as quatro línguas nacionais na Suíça são alemão, francês, italiano e romanche, então você pode tentar criar um corpus suíço amostrando os subconjuntos do Oscar de acordo com sua proporção falada.</p></div> <p data-svelte-h="svelte-1k4043d">Agora você tem todas as ferramentas necessárias para carregar e processar conjuntos de dados de todas as formas e tamanhos, mas, a menos que tenha muita sorte, chegará um ponto em sua jornada de PNL em que você terá que criar um conjunto de dados para resolver o problema. problema em mãos. Esse é o tema da próxima seção!</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/pt/chapter5/4.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1ef170a = {
assets: "/docs/course/pr_1069/pt",
base: "/docs/course/pr_1069/pt",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/pt/_app/immutable/entry/start.854f6ddb.js"),
import("/docs/course/pr_1069/pt/_app/immutable/entry/app.99d5705b.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 31],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
73.6 kB
·
Xet hash:
6148388504f8c6616841d7f0d2479731bb035f893e43b2cb7f28063470c8ee4c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.