{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "77108016d60544c68f0119d0d5331f14": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_e6f8c479648e40b9b99aac32ef922eec" } }, "6fe0923245164569a40700dc54b9f401": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_0605f783465f49088a98f449c281ab94", "placeholder": "​", "style": "IPY_MODEL_af3c7f25462a48f796972cca107ee7d3", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "2fc1eeff37814dd09a821385641faa52": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_e1626269dd8a4a7281be4a10fd2486c1", "placeholder": "​", "style": "IPY_MODEL_fb2504b90bbe485183e94cd4ce129f4a", "value": "" } }, "522487923851431197922709d2036e13": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_866417963f5346a89bd9fe673c199dc6", "style": "IPY_MODEL_e9c3b4eb72704894ad18ddaf871e4587", "value": true } }, "347ad9c5c0614b63b9e6cd43e5ee27b6": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_725111debb4c4a1f9179e4bd952c16cd", "style": "IPY_MODEL_f69b7d7de04c43d6822e62c96a13c2f8", "tooltip": "" } }, "3f5f3f637227480e829f3264d5fedb0d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f5aac65ffeda42fd89e4217ccdda67e3", "placeholder": "​", "style": "IPY_MODEL_83622f36d35946f7befb4e07004ea7de", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " } }, "e6f8c479648e40b9b99aac32ef922eec": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "0605f783465f49088a98f449c281ab94": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "af3c7f25462a48f796972cca107ee7d3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e1626269dd8a4a7281be4a10fd2486c1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fb2504b90bbe485183e94cd4ce129f4a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "866417963f5346a89bd9fe673c199dc6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e9c3b4eb72704894ad18ddaf871e4587": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "725111debb4c4a1f9179e4bd952c16cd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f69b7d7de04c43d6822e62c96a13c2f8": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "f5aac65ffeda42fd89e4217ccdda67e3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "83622f36d35946f7befb4e07004ea7de": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b0707adc2c90470cbca8681c5b879b25": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ce56996ca0514d2eabe695c1c043234f", "placeholder": "​", "style": "IPY_MODEL_87ae78407e9d4d8486740e984ec14327", "value": "Connecting..." } }, "ce56996ca0514d2eabe695c1c043234f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "87ae78407e9d4d8486740e984ec14327": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "1007d759823a42de9f368fbadc56e848": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_f1757b0c26e744cd9a004448d1e01e85", "IPY_MODEL_d7113be84dc24cc1829b57f137e177b5", "IPY_MODEL_4140d523a1444658b4491dc136ca0cf7" ], "layout": "IPY_MODEL_b0e46aa67eb64ca498d12eabf70cf165" } }, "f1757b0c26e744cd9a004448d1e01e85": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d2439966e0924dab89bee28926346505", "placeholder": "​", "style": "IPY_MODEL_5057789732f646dc9c936399170c282a", "value": "README.md: 100%" } }, "d7113be84dc24cc1829b57f137e177b5": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a95403263d064ccc863635fcf224e15d", "max": 30, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_9f3090c5cf2c48d78d4a73a34626ef72", "value": 30 } }, "4140d523a1444658b4491dc136ca0cf7": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e259f8abc23743debf59bcd4947d5699", "placeholder": "​", "style": "IPY_MODEL_d9a55305f6044af59596fa4ecbeeb162", "value": " 30.0/30.0 [00:00<00:00, 2.41kB/s]" } }, "b0e46aa67eb64ca498d12eabf70cf165": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d2439966e0924dab89bee28926346505": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5057789732f646dc9c936399170c282a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a95403263d064ccc863635fcf224e15d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9f3090c5cf2c48d78d4a73a34626ef72": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "e259f8abc23743debf59bcd4947d5699": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d9a55305f6044af59596fa4ecbeeb162": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "28f8363ba6294e05afd19ec172b242a9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_3944c8aad42347529f8c63b769cda160", "IPY_MODEL_bc6f1306aa7d47c0b710b64ebf12c0ef", "IPY_MODEL_ace40c3870a64991a3b5c4622961984b" ], "layout": "IPY_MODEL_e2fcea8a0a4e452c997d3007a6fe7a10" } }, "3944c8aad42347529f8c63b769cda160": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3d329669a1cd48b69cc808f71ecf57bb", "placeholder": "​", "style": "IPY_MODEL_65a2b59724a447ebab7348df4320dca9", "value": "dataset.json: 100%" } }, "bc6f1306aa7d47c0b710b64ebf12c0ef": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4f02f620f3f84b9a92ae578e94977e76", "max": 11582708, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_fafee59c6b3e4bdba63397c1761bb5a4", "value": 11582708 } }, "ace40c3870a64991a3b5c4622961984b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6d2c7548e5a2491594f82f87c08b80d9", "placeholder": "​", "style": "IPY_MODEL_e3a31f7404f545bf9765a74449ab8961", "value": " 11.6M/11.6M [00:03<00:00, 57.9MB/s]" } }, "e2fcea8a0a4e452c997d3007a6fe7a10": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3d329669a1cd48b69cc808f71ecf57bb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "65a2b59724a447ebab7348df4320dca9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "4f02f620f3f84b9a92ae578e94977e76": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fafee59c6b3e4bdba63397c1761bb5a4": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "6d2c7548e5a2491594f82f87c08b80d9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e3a31f7404f545bf9765a74449ab8961": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "9fe06806958e41f9a7d80434d617f20e": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_ad57d5995ef04af4ad1e8069e45f2966", "IPY_MODEL_2d4ba84863a243eab6cff83612c97a47", "IPY_MODEL_bb68ebd86e61462bacbde79207e5ad19" ], "layout": "IPY_MODEL_8695d613a7b4490689733ee5d9e84b06" } }, "ad57d5995ef04af4ad1e8069e45f2966": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_39be1bf72ff84b09ab94712657fa5c95", "placeholder": "​", "style": "IPY_MODEL_aad48157ae4247b88910b5bf451aa8b3", "value": "Generating train split: " } }, "2d4ba84863a243eab6cff83612c97a47": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4f115da2877a474d83c10b000df6a2cb", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_1349caeb3c574ac8b50ffba037e1458c", "value": 1 } }, "bb68ebd86e61462bacbde79207e5ad19": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ca5706e6910a40f3ac222865b840192e", "placeholder": "​", "style": "IPY_MODEL_c46f7db66c0840b3a78082ae8e71da2d", "value": " 10642/0 [00:00<00:00, 24744.91 examples/s]" } }, "8695d613a7b4490689733ee5d9e84b06": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "39be1bf72ff84b09ab94712657fa5c95": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "aad48157ae4247b88910b5bf451aa8b3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "4f115da2877a474d83c10b000df6a2cb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "20px" } }, "1349caeb3c574ac8b50ffba037e1458c": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "ca5706e6910a40f3ac222865b840192e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c46f7db66c0840b3a78082ae8e71da2d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nJKY-qosqjsC", "outputId": "6157751e-b021-427c-e04c-0a523d7544d8" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting underthesea\n", " Downloading underthesea-9.5.0-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.12/dist-packages (1.11.0)\n", "Requirement already satisfied: Click>=6.0 in /usr/local/lib/python3.12/dist-packages (from underthesea) (8.3.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from underthesea) (4.67.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from underthesea) (2.32.4)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from underthesea) (1.5.3)\n", "Requirement already satisfied: PyYAML in /usr/local/lib/python3.12/dist-packages (from underthesea) (6.0.3)\n", "Collecting underthesea_core>=3.3.0 (from underthesea)\n", " Downloading underthesea_core-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from datasets) (3.29.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.0.2)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.6.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from datasets) (26.1)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (1.4.3)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (0.28.1)\n", "Requirement already satisfied: typer in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (0.24.2)\n", "Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (4.15.0)\n", "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.13.5)\n", "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (4.13.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (2026.4.22)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (1.0.9)\n", "Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (3.13)\n", "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface_hub) (0.16.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->underthesea) (3.4.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->underthesea) (2.5.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2026.1)\n", "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (1.5.4)\n", "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (13.9.4)\n", "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (0.0.4)\n", "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n", "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (26.1.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.8.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.7.1)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.4.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.23.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->huggingface_hub) (4.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->huggingface_hub) (2.20.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer->huggingface_hub) (0.1.2)\n", "Downloading underthesea-9.5.0-py3-none-any.whl (7.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m95.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading underthesea_core-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m78.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: underthesea_core, underthesea\n", "Successfully installed underthesea-9.5.0 underthesea_core-3.3.0\n" ] } ], "source": [ "!pip install underthesea datasets scikit-learn huggingface_hub" ] }, { "cell_type": "code", "source": [ "from huggingface_hub import login\n", "\n", "login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 141, "referenced_widgets": [ "77108016d60544c68f0119d0d5331f14", "6fe0923245164569a40700dc54b9f401", "2fc1eeff37814dd09a821385641faa52", "522487923851431197922709d2036e13", "347ad9c5c0614b63b9e6cd43e5ee27b6", "3f5f3f637227480e829f3264d5fedb0d", "e6f8c479648e40b9b99aac32ef922eec", "0605f783465f49088a98f449c281ab94", "af3c7f25462a48f796972cca107ee7d3", "e1626269dd8a4a7281be4a10fd2486c1", "fb2504b90bbe485183e94cd4ce129f4a", "866417963f5346a89bd9fe673c199dc6", "e9c3b4eb72704894ad18ddaf871e4587", "725111debb4c4a1f9179e4bd952c16cd", "f69b7d7de04c43d6822e62c96a13c2f8", "f5aac65ffeda42fd89e4217ccdda67e3", "83622f36d35946f7befb4e07004ea7de", "b0707adc2c90470cbca8681c5b879b25", "ce56996ca0514d2eabe695c1c043234f", "87ae78407e9d4d8486740e984ec14327" ] }, "id": "_NC3VCthA9uV", "outputId": "4f04ae0c-7024-4e60-d033-3135c30a2185" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:93: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
\"] = 0\n", "vocab[\"\"] = 1\n", "\n", "print(\"Vocab size:\", len(vocab))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7JXwGAsErfKU", "outputId": "a4e53056-57b3-4c2b-8ff6-df4682e6075b" }, "execution_count": 11, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Vocab size: 37282\n" ] } ] }, { "cell_type": "code", "source": [ "MAX_LEN = 200\n", "\n", "def encode_tokens(tokens):\n", "\n", " ids = [\n", " vocab.get(token, 1)\n", " for token in tokens\n", " ]\n", "\n", " ids = ids[:MAX_LEN]\n", "\n", " ids += [0] * (MAX_LEN - len(ids))\n", "\n", " return ids" ], "metadata": { "id": "wkyQrMKtriSH" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "for item in train_data:\n", " item[\"input_ids\"] = encode_tokens(\n", " item[\"tokens\"]\n", " )\n", "\n", "for item in val_data:\n", " item[\"input_ids\"] = encode_tokens(\n", " item[\"tokens\"]\n", " )\n", "\n", "for item in test_data:\n", " item[\"input_ids\"] = encode_tokens(\n", " item[\"tokens\"]\n", " )" ], "metadata": { "id": "lm2l1k_nrkVC" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "def extract_features(data):\n", "\n", " return np.array([\n", " [\n", " item[\"num_words\"],\n", " item[\"avg_sentence_length\"],\n", " item[\"type_token_ratio\"]\n", " ]\n", " for item in data\n", " ])" ], "metadata": { "id": "f3qQ0YkzroId" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "scaler = StandardScaler()\n", "\n", "train_features = scaler.fit_transform(\n", " extract_features(train_data)\n", ")\n", "\n", "val_features = scaler.transform(\n", " extract_features(val_data)\n", ")\n", "\n", "test_features = scaler.transform(\n", " extract_features(test_data)\n", ")" ], "metadata": { "id": "-iLTQU_1rr1U" }, "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ "class TextDataset(Dataset):\n", "\n", " def __init__(self, data, features):\n", "\n", " self.data = data\n", " self.features = features\n", "\n", " def __len__(self):\n", "\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", "\n", " item = self.data[idx]\n", "\n", " input_ids = torch.tensor(\n", " item[\"input_ids\"],\n", " dtype=torch.long\n", " )\n", "\n", " features = torch.tensor(\n", " self.features[idx],\n", " dtype=torch.float\n", " )\n", "\n", " label = torch.tensor(\n", " item[\"level\"],\n", " dtype=torch.long\n", " )\n", "\n", " return {\n", " \"input_ids\": input_ids,\n", " \"features\": features,\n", " \"label\": label\n", " }" ], "metadata": { "id": "SejAFYDnruZg" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "train_dataset = TextDataset(\n", " train_data,\n", " train_features\n", ")\n", "\n", "val_dataset = TextDataset(\n", " val_data,\n", " val_features\n", ")\n", "\n", "test_dataset = TextDataset(\n", " test_data,\n", " test_features\n", ")\n", "\n", "train_loader = DataLoader(\n", " train_dataset,\n", " batch_size=32,\n", " shuffle=True\n", ")\n", "\n", "val_loader = DataLoader(\n", " val_dataset,\n", " batch_size=32\n", ")\n", "\n", "test_loader = DataLoader(\n", " test_dataset,\n", " batch_size=32\n", ")" ], "metadata": { "id": "zUykY2XarwbM" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "class TextCNN(nn.Module):\n", "\n", " def __init__(\n", " self,\n", " vocab_size,\n", " embed_dim=128,\n", " num_filters=100,\n", " filter_sizes=[3,4,5],\n", " num_features=3,\n", " num_classes=3,\n", " dropout=0.3\n", " ):\n", " super().__init__()\n", "\n", " self.embedding = nn.Embedding(\n", " vocab_size,\n", " embed_dim,\n", " padding_idx=0\n", " )\n", "\n", " self.convs = nn.ModuleList([\n", " nn.Conv1d(\n", " in_channels=embed_dim,\n", " out_channels=num_filters,\n", " kernel_size=fs\n", " )\n", " for fs in filter_sizes\n", " ])\n", "\n", " self.dropout = nn.Dropout(dropout)\n", "\n", " cnn_output_dim = (\n", " num_filters * len(filter_sizes)\n", " )\n", "\n", " self.classifier = nn.Sequential(\n", "\n", " nn.Linear(\n", " cnn_output_dim + num_features,\n", " 128\n", " ),\n", "\n", " nn.ReLU(),\n", "\n", " nn.Dropout(dropout),\n", "\n", " nn.Linear(128, num_classes)\n", " )\n", "\n", " def forward(self, input_ids, features):\n", "\n", " x = self.embedding(input_ids)\n", "\n", " # [batch, seq_len, embed_dim]\n", " x = x.permute(0, 2, 1)\n", "\n", " # [batch, embed_dim, seq_len]\n", "\n", " conv_outputs = []\n", "\n", " for conv in self.convs:\n", "\n", " c = torch.relu(conv(x))\n", "\n", " pooled = torch.max(\n", " c,\n", " dim=2\n", " )[0]\n", "\n", " conv_outputs.append(pooled)\n", "\n", " cnn_features = torch.cat(\n", " conv_outputs,\n", " dim=1\n", " )\n", "\n", " cnn_features = self.dropout(\n", " cnn_features\n", " )\n", "\n", " combined = torch.cat(\n", " [cnn_features, features],\n", " dim=1\n", " )\n", "\n", " logits = self.classifier(\n", " combined\n", " )\n", "\n", " return logits" ], "metadata": { "id": "_whVMqS8r0Gv" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "device = torch.device(\n", " \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", ")\n", "\n", "model = TextCNN(\n", " vocab_size=len(vocab)\n", ").to(device)\n", "\n", "train_labels = [\n", " item[\"level\"]\n", " for item in train_data\n", "]\n", "\n", "weights = compute_class_weight(\n", " class_weight=\"balanced\",\n", " classes=np.array([0,1,2]),\n", " y=train_labels\n", ")\n", "\n", "weights = torch.tensor(\n", " weights,\n", " dtype=torch.float\n", ")\n", "\n", "criterion = nn.CrossEntropyLoss(\n", " weight=weights.to(device)\n", ")\n", "\n", "optimizer = torch.optim.Adam(\n", " model.parameters(),\n", " lr=5e-4\n", ")" ], "metadata": { "id": "JiDkgq--r2aT" }, "execution_count": 19, "outputs": [] }, { "cell_type": "code", "source": [ "best_val_f1 = 0\n", "\n", "patience = 3\n", "counter = 0\n", "\n", "EPOCHS = 10\n", "\n", "for epoch in range(EPOCHS):\n", "\n", " # TRAIN\n", " model.train()\n", "\n", " total_loss = 0\n", "\n", " for batch in train_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " loss = criterion(\n", " logits,\n", " labels\n", " )\n", "\n", " optimizer.zero_grad()\n", "\n", " loss.backward()\n", "\n", " optimizer.step()\n", "\n", " total_loss += loss.item()\n", "\n", " # VALIDATION\n", " model.eval()\n", "\n", " all_preds = []\n", " all_labels = []\n", "\n", " with torch.no_grad():\n", "\n", " for batch in val_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " preds = torch.argmax(\n", " logits,\n", " dim=1\n", " )\n", "\n", " all_preds.extend(\n", " preds.cpu().numpy()\n", " )\n", "\n", " all_labels.extend(\n", " labels.cpu().numpy()\n", " )\n", "\n", " val_f1 = f1_score(\n", " all_labels,\n", " all_preds,\n", " average=\"macro\"\n", " )\n", "\n", " print(\n", " f\"Epoch {epoch+1} \"\n", " f\"| Loss: {total_loss:.4f} \"\n", " f\"| Val F1: {val_f1:.4f}\"\n", " )\n", "\n", " # EARLY STOPPING\n", " if val_f1 > best_val_f1:\n", "\n", " best_val_f1 = val_f1\n", "\n", " counter = 0\n", "\n", " torch.save(\n", " model.state_dict(),\n", " \"textcnn_best_model.pt\"\n", " )\n", "\n", " else:\n", " counter += 1\n", "\n", " if counter >= patience:\n", " print(\"Early stopping!\")\n", " break" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_LNmSylFr5cs", "outputId": "3d742ec1-78f5-40f7-a86c-d645f86569de" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1 | Loss: 229.7198 | Val F1: 0.6626\n", "Epoch 2 | Loss: 156.0087 | Val F1: 0.6983\n", "Epoch 3 | Loss: 115.6129 | Val F1: 0.8139\n", "Epoch 4 | Loss: 92.4094 | Val F1: 0.8098\n", "Epoch 5 | Loss: 76.6749 | Val F1: 0.8097\n", "Epoch 6 | Loss: 60.1265 | Val F1: 0.8181\n", "Epoch 7 | Loss: 48.3459 | Val F1: 0.8303\n", "Epoch 8 | Loss: 44.4590 | Val F1: 0.8336\n", "Epoch 9 | Loss: 39.0407 | Val F1: 0.7962\n", "Epoch 10 | Loss: 30.5372 | Val F1: 0.8379\n" ] } ] }, { "cell_type": "code", "source": [ "model.load_state_dict(\n", " torch.load(\"textcnn_best_model.pt\")\n", ")\n", "\n", "model.eval()\n", "\n", "all_preds = []\n", "all_labels = []\n", "\n", "with torch.no_grad():\n", "\n", " for batch in test_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " preds = torch.argmax(\n", " logits,\n", " dim=1\n", " )\n", "\n", " all_preds.extend(\n", " preds.cpu().numpy()\n", " )\n", "\n", " all_labels.extend(\n", " labels.cpu().numpy()\n", " )\n", "\n", "print(\n", " classification_report(\n", " all_labels,\n", " all_preds,\n", " digits=4\n", " )\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "v9kOA5Jmr7lx", "outputId": "fb04c33a-ee7b-4adc-8f6a-6f714a535982" }, "execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " precision recall f1-score support\n", "\n", " 0 0.7647 0.9286 0.8387 182\n", " 1 0.8587 0.8566 0.8577 844\n", " 2 0.8371 0.7828 0.8090 571\n", "\n", " accuracy 0.8384 1597\n", " macro avg 0.8202 0.8560 0.8351 1597\n", "weighted avg 0.8402 0.8384 0.8381 1597\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "torch.save(\n", " model.state_dict(),\n", " \"textcnn_pytorch_model.bin\"\n", ")\n", "\n", "with open(\"textcnn_vocab.pkl\", \"wb\") as f:\n", " pickle.dump(vocab, f)\n", "\n", "with open(\"textcnn_scaler.pkl\", \"wb\") as f:\n", " pickle.dump(scaler, f)\n", "\n", "config = {\n", " \"model_type\": \"TextCNN\",\n", " \"embedding_dim\": 128,\n", " \"num_filters\": 100,\n", " \"filter_sizes\": [3,4,5],\n", " \"max_length\": 200,\n", " \"num_classes\": 3\n", "}\n", "\n", "with open(\"textcnn_config.json\", \"w\") as f:\n", " json.dump(config, f, indent=4)" ], "metadata": { "id": "MduILtOLsHcC" }, "execution_count": 22, "outputs": [] } ] }