{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "59acb6a5958b4c198cb46454b7a6dec9": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_5ac625801dea4629aecece1ff517264a", "IPY_MODEL_55ca96d7fc30455bababc6bad1b7e1a5", "IPY_MODEL_46deb03028054b14a5b15823fef6ee87" ], "layout": "IPY_MODEL_0edf7dcbc72d4e58a38ae37f8254042f" } }, "5ac625801dea4629aecece1ff517264a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4772e836196a4a7b956964f9167b56ee", "placeholder": "​", "style": "IPY_MODEL_56107b13ea3b4762bfdae3c4223423a4", "value": "tokenizer_config.json: 100%" } }, "55ca96d7fc30455bababc6bad1b7e1a5": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", 
"description_tooltip": null, "layout": "IPY_MODEL_cc4656efe2154ad3a45df827db7f1545", "max": 28, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_7667c315a9e349b1a68a569b4d139419", "value": 28 } }, "46deb03028054b14a5b15823fef6ee87": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_56cf608a62a54e3bb6fa58a9b957d9b0", "placeholder": "​", "style": "IPY_MODEL_c0c0aedf7a43453fab24711721427223", "value": " 28.0/28.0 [00:00<00:00, 1.58kB/s]" } }, "0edf7dcbc72d4e58a38ae37f8254042f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"4772e836196a4a7b956964f9167b56ee": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "56107b13ea3b4762bfdae3c4223423a4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "cc4656efe2154ad3a45df827db7f1545": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7667c315a9e349b1a68a569b4d139419": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "56cf608a62a54e3bb6fa58a9b957d9b0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c0c0aedf7a43453fab24711721427223": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "41fe5c4b1b734bc4a41cbd1f2db00989": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_fdd3f19839c040e5a4a466460b5a2480", "IPY_MODEL_50d15161d5344de9b9cbeba28862f957", "IPY_MODEL_f05643a308d649d8b990813b50cbc4aa" ], "layout": "IPY_MODEL_62032165e7ab4bd393e096fc6221fcb7" } }, "fdd3f19839c040e5a4a466460b5a2480": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1bdeb4fed33243ea83bf98428f369db3", "placeholder": "​", "style": "IPY_MODEL_f75043abf72e4739ba3bb86090215fc5", "value": "config.json: 
100%" } }, "50d15161d5344de9b9cbeba28862f957": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_fc1b15b082594ffa94ecf60f532c8a5c", "max": 483, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_788452ff4de244dab85b8b8adc42b4ea", "value": 483 } }, "f05643a308d649d8b990813b50cbc4aa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_59093618fafe46eea7a1e38bb20a9300", "placeholder": "​", "style": "IPY_MODEL_3f6fb8acd7e74442a894330c6b4767a8", "value": " 483/483 [00:00<00:00, 27.4kB/s]" } }, "62032165e7ab4bd393e096fc6221fcb7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": 
null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1bdeb4fed33243ea83bf98428f369db3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f75043abf72e4739ba3bb86090215fc5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "StyleView", "description_width": "" } }, "fc1b15b082594ffa94ecf60f532c8a5c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "788452ff4de244dab85b8b8adc42b4ea": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "59093618fafe46eea7a1e38bb20a9300": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "3f6fb8acd7e74442a894330c6b4767a8": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "0994304224d24789b23432c2f7de953a": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_177edf091f7749dfb5ef3f84a6d92c75", "IPY_MODEL_2429a72651864f7fa6c9a57eeccebea8", "IPY_MODEL_cd7ce4a3424a475b9b4a7ea47a71e914" ], "layout": "IPY_MODEL_f7ab5c61c83e4b029a9109e30acc13b4" } }, "177edf091f7749dfb5ef3f84a6d92c75": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": 
"1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5adeff780b084deea141f38bf1549301", "placeholder": "​", "style": "IPY_MODEL_d975f27f07904fdbbeb663b959c0e580", "value": "vocab.txt: 100%" } }, "2429a72651864f7fa6c9a57eeccebea8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bd059140d3f24b3381a96f2f4adca8c4", "max": 231508, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_88a2bcfb25e1420eb736839677e2753a", "value": 231508 } }, "cd7ce4a3424a475b9b4a7ea47a71e914": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3d28bb240d504e2cb20329a47583a1db", "placeholder": "​", "style": "IPY_MODEL_d61521350b154468be12b2b2ce20b971", "value": " 232k/232k [00:00<00:00, 1.88MB/s]" } }, "f7ab5c61c83e4b029a9109e30acc13b4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": 
"@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5adeff780b084deea141f38bf1549301": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, 
"overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d975f27f07904fdbbeb663b959c0e580": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bd059140d3f24b3381a96f2f4adca8c4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "88a2bcfb25e1420eb736839677e2753a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": 
null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "3d28bb240d504e2cb20329a47583a1db": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d61521350b154468be12b2b2ce20b971": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fc4b486eed0a46d0b3dd6ee32a13ba48": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", 
"_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_91bedc8101a44adbb9deb5dae6908c7a", "IPY_MODEL_c2c80ee83cb14f7ba55b437bef36746f", "IPY_MODEL_d4691144447c4c6e949e956d9cc2acbf" ], "layout": "IPY_MODEL_1a90905ebad24062ae6aceef6596d923" } }, "91bedc8101a44adbb9deb5dae6908c7a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f94bc9cc61fa43e2ab811fae617eae53", "placeholder": "​", "style": "IPY_MODEL_4dd54f9cb56f4765bae7ebbcbb8ab4e5", "value": "tokenizer.json: 100%" } }, "c2c80ee83cb14f7ba55b437bef36746f": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5454a07844794801afaea942de3cda98", "max": 466062, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_cf071df905c5444286f15a9bf2b95828", "value": 466062 } }, "d4691144447c4c6e949e956d9cc2acbf": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_cd7813cb1eb845b8b85ce40c8658299a", "placeholder": "​", "style": "IPY_MODEL_cb475a7759d94e3e9fb891136af83e2b", "value": " 466k/466k [00:00<00:00, 30.3MB/s]" } }, "1a90905ebad24062ae6aceef6596d923": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f94bc9cc61fa43e2ab811fae617eae53": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": 
null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4dd54f9cb56f4765bae7ebbcbb8ab4e5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "5454a07844794801afaea942de3cda98": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cf071df905c5444286f15a9bf2b95828": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "cd7813cb1eb845b8b85ce40c8658299a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cb475a7759d94e3e9fb891136af83e2b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b45024a0ec8747a2bd2677b3d8e10352": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_483daaf25506490da97f687523439aaa", "IPY_MODEL_98b5dd2edffe4921a0a080c98bdd4792", "IPY_MODEL_1a41c82bf7504e8085bfd16d3bbf36a2" ], "layout": "IPY_MODEL_ab95197ced5449b496f1221079b36908" } }, "483daaf25506490da97f687523439aaa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_470ddaf005054b25b05125b2fa7fde4a", "placeholder": "​", "style": "IPY_MODEL_65722a6fde5e4087834e7a741a315cae", "value": "model.safetensors: 100%" } }, "98b5dd2edffe4921a0a080c98bdd4792": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", 
"description_tooltip": null, "layout": "IPY_MODEL_516dc0635a214f06997574c4f902de97", "max": 267954768, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_dc72eb488b4547a38ac317f37261216a", "value": 267954768 } }, "1a41c82bf7504e8085bfd16d3bbf36a2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_47cc743855a7459a90c127d3ab60b3a3", "placeholder": "​", "style": "IPY_MODEL_065bf7a1f04945c38aca86a0f972aca9", "value": " 268M/268M [00:01<00:00, 241MB/s]" } }, "ab95197ced5449b496f1221079b36908": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, 
"470ddaf005054b25b05125b2fa7fde4a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "65722a6fde5e4087834e7a741a315cae": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "516dc0635a214f06997574c4f902de97": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": 
null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dc72eb488b4547a38ac317f37261216a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "47cc743855a7459a90c127d3ab60b3a3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": 
null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "065bf7a1f04945c38aca86a0f972aca9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f5e534edc11044aa936378c8cf562b38": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_25146f192d584689b4e371436f6d8b0a", "IPY_MODEL_689566da96cf4e169e1eb7be44ca1f74", "IPY_MODEL_b095a42665b147e0b180790051f4b330" ], "layout": "IPY_MODEL_04fec7188d2741b9a62a8ba2471bc763" } }, "25146f192d584689b4e371436f6d8b0a": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6506147f04b144a69ce6968bcee16ab8", "placeholder": "​", "style": "IPY_MODEL_9cc61be2e6714cc8babdd4e59614c75e", "value": "100%" } }, 
"689566da96cf4e169e1eb7be44ca1f74": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a85f5ac290a74822b073cf9eeedadc03", "max": 8058, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_e85eeabeede8410d8b5a66953ce74b15", "value": 8058 } }, "b095a42665b147e0b180790051f4b330": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7bb6173bd48149b490e329b9117c7270", "placeholder": "​", "style": "IPY_MODEL_804707107d0b42108b26bb156310a326", "value": " 8058/8058 [05:54<00:00, 20.94it/s]" } }, "04fec7188d2741b9a62a8ba2471bc763": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6506147f04b144a69ce6968bcee16ab8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9cc61be2e6714cc8babdd4e59614c75e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"StyleView", "description_width": "" } }, "a85f5ac290a74822b073cf9eeedadc03": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e85eeabeede8410d8b5a66953ce74b15": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "7bb6173bd48149b490e329b9117c7270": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "804707107d0b42108b26bb156310a326": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "markdown", "source": [ "Parsing of website, collecting data and preprocees it are in parsing.py" ], "metadata": { "id": "BAD4mFtLXxIs" } }, { "cell_type": "code", "source": [ "!python parsing.py" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PN1OyQH2jOu_", "outputId": "63dc9f1b-3d4c-4f05-fb86-f4988c7ddcb5" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Parse all scripts from this website https://fangj.github.io/friends/\n", "Total: 100% 228/228 [00:39<00:00, 5.81it/s]\n", "Number of characters in dataframe 196\n", "100% 6/6 [00:00<00:00, 9.65it/s]\n", "script created\n" ] } ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "MtQc61jY2cEM", "outputId": "5f190a50-609f-4c3f-a26a-53eea42750bc" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n", "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n", "[nltk_data] Downloading package averaged_perceptron_tagger to\n", "[nltk_data] /root/nltk_data...\n", "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": {}, "execution_count": 2 } ], "source": [ "import pandas as pd\n", "import nltk\n", "import numpy as np\n", "import os\n", "import re #regular expressions\n", "from nltk.stem import wordnet # for lemmtization\n", "from sklearn.feature_extraction.text import CountVectorizer # for bag of words (bow)\n", "from sklearn.feature_extraction.text import TfidfVectorizer #for tfidf\n", "from nltk import pos_tag # for parts of speech\n", "from sklearn.metrics import pairwise_distances # cosine similarity\n", "from nltk import word_tokenize\n", "from nltk.corpus import stopwords\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "nltk.download('omw-1.4') #this is for the .apply() function to work\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('wordnet')\n", "nltk.download('stopwords')" ] }, { "cell_type": "markdown", "source": [ "### Look at the stopwards from nltk library" ], "metadata": { "id": "By8mT6qCQFup" } }, { "cell_type": "code", "source": [ "for w in set(stopwords.words('english')):\n", " print(w, end=\" | \")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VTlstJxXDepv", "outputId": 
"36194c60-1f69-4e3a-e90e-20d799225fef" }, "execution_count": 3, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "up | doesn't | themselves | there | off | some | ain | yours | any | if | with | did | too | were | has | does | can | where | about | not | mustn | needn't | doing | once | they | all | below | herself | into | same | needn | you'll | couldn't | won | my | nor | this | through | didn | between | hers | yourself | s | wouldn | out | in | i | y | most | such | hasn | ourselves | under | for | o | by | she's | been | it's | further | couldn | ma | down | then | you're | didn't | me | than | these | here | until | she | be | because | haven't | why | while | won't | an | which | hadn't | had | he | own | no | are | at | of | theirs | t | having | aren't | against | weren | its | isn | it | don | we | her | again | don't | yourselves | wasn | mightn't | that | you'd | should | to | weren't | so | now | doesn | but | over | aren | ll | him | shan't | them | when | himself | how | only | being | during | whom | both | ours | or | have | hasn't | m | our | hadn | from | above | as | wouldn't | what | that'll | re | haven | his | just | the | isn't | on | each | mustn't | myself | do | is | d | wasn't | shan | shouldn't | mightn | after | very | you've | will | shouldn | your | more | and | those | other | their | should've | who | am | before | was | a | ve | you | itself | few | " ] } ] }, { "cell_type": "markdown", "source": [ "### Check the data for any character" ], "metadata": { "id": "iss7hUaErKu9" } }, { "cell_type": "code", "source": [ "df = pd.read_csv(\"rachel_friends.csv\") # read the database into a data frame\n", "# df = pd.read_csv(\"RickAndMortyScripts.csv\") # read the database into a data frame\n", "df.head(10) # see first 5 lines" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "_EjZKPu6_x2i", "outputId": "0471712a-5762-4531-b649-45818053d0c7" }, "execution_count": 4, "outputs": [ { 
"output_type": "execute_result", "data": { "text/plain": [ " questioner question answerer \\\n", "0 ross hey uh mon, i saw the porsche parked out front... rachel \n", "1 joey saw the porsche out there mon, lookin’ good. w... rachel \n", "2 monica nice work everybody! so much for the y’know, y... rachel \n", "3 ross whew! that was a brisk ride! rachel \n", "4 ross only way to fly. rachel \n", "5 ross you’re fast and irresponsible. that adds up to... rachel \n", "6 ross did you see the look that girl just gave me? h... rachel \n", "7 ross what?! give-give me a brush. rachel \n", "8 ross no way! rachel \n", "9 ross fine! y’know what? it doesn’t matter, because,... rachel \n", "\n", " answer \n", "0 wait a minute! you let ross drive the porsche... \n", "1 you let joey drive it?! \n", "2 wow! i can’t believe you lied to me. \n", "3 take the top down did ya? \n", "4 come on ross give me the keys! monica does not... \n", "5 well in high school, that added up to head che... \n", "6 i think she’s checking out your beehive ross. \n", "7 gimme the keys! \n", "8 well no brush! \n", "9 alimony. " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionerquestionanswereranswer
0rosshey uh mon, i saw the porsche parked out front...rachelwait a minute! you let ross drive the porsche...
1joeysaw the porsche out there mon, lookin’ good. w...rachelyou let joey drive it?!
2monicanice work everybody! so much for the y’know, y...rachelwow! i can’t believe you lied to me.
3rosswhew! that was a brisk ride!racheltake the top down did ya?
4rossonly way to fly.rachelcome on ross give me the keys! monica does not...
5rossyou’re fast and irresponsible. that adds up to...rachelwell in high school, that added up to head che...
6rossdid you see the look that girl just gave me? h...racheli think she’s checking out your beehive ross.
7rosswhat?! give-give me a brush.rachelgimme the keys!
8rossno way!rachelwell no brush!
9rossfine! y’know what? it doesn’t matter, because,...rachelalimony.
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 8058,\n \"fields\": [\n {\n \"column\": \"questioner\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"susan\",\n \"chandler\",\n \"megan\"\n ],\n \"num_unique_values\": 117,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"question\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"i can tell from your expressions that thats the good news you were hoping for... well, im gonna go continue to... spread the joy.\",\n \"you said two weeks.\",\n \"the game, rachel, the game. you owe us money for the game.\"\n ],\n \"num_unique_values\": 7117,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answerer\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"rachel\"\n ],\n \"num_unique_values\": 1,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"yeah. oh! was how you invented the cotton gin?!\"\n ],\n \"num_unique_values\": 6969,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "df.iloc[2][\"question\"], df.iloc[2][\"answer\"]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9tpteg0Dry3F", "outputId": "471b1ce0-2669-4a70-d810-909536716495" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('nice work everybody! so much for the y’know, you can drive it, but don’t tell rachel plan!',\n", " 'wow! 
i can’t believe you lied to me.')" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "# check for null values / empty cells\n", "df.isnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "7XpB4YIEM9Mb", "outputId": "c402906a-7065-453b-a615-5701dd135e89" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "questioner 0\n", "question 0\n", "answerer 0\n", "answer 0\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "source": [
  "def text_normalization(text):\n",
  "    \"\"\"Lower-case `text`, keep only the letters a-z and lemmatize each token.\n",
  "\n",
  "    Every character outside a-z (digits, punctuation, apostrophes, ...) is\n",
  "    replaced by a space before tokenizing, so `y’know` comes out as `y know`.\n",
  "\n",
  "    Returns the lemmatized tokens joined by single spaces.\n",
  "    \"\"\"\n",
  "    # first letter of the pos_tag label -> lemmatizer POS: verb, adjective, adverb\n",
  "    pos_by_initial = {'V': 'v', 'J': 'a', 'R': 'r'}\n",
  "    letters_only = re.sub(r'[^a-z]', ' ', str(text).lower())\n",
  "    tagged_tokens = pos_tag(nltk.word_tokenize(letters_only), tagset = None)\n",
  "    lemmatizer = wordnet.WordNetLemmatizer()\n",
  "    # any label that is not in the table is lemmatized as a noun ('n')\n",
  "    return \" \".join(\n",
  "        lemmatizer.lemmatize(token, pos_by_initial.get(label[:1], 'n'))\n",
  "        for token, label in tagged_tokens\n",
  "    )"
 ], "metadata": { "id": "OeAT0DJ3M_bp" }, "execution_count": 7, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Normalize questions" ], "metadata": { "id": "1Pm0AUcR6SdR" } }, { "cell_type": "code", "source": [ "question_normalized = df['question'].apply(text_normalization)\n", "df.insert(2, 'Normalized question', question_normalized, True)\n", "df.head(), df.size" ], "metadata": { "colab": { 
"base_uri": "https://localhost:8080/" }, "id": "hqXzLpIyZt1h", "outputId": "cd47ac32-0677-4036-99fc-345fe49cb3e9" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "( questioner question \\\n", " 0 ross hey uh mon, i saw the porsche parked out front... \n", " 1 joey saw the porsche out there mon, lookin’ good. w... \n", " 2 monica nice work everybody! so much for the y’know, y... \n", " 3 ross whew! that was a brisk ride! \n", " 4 ross only way to fly. \n", " \n", " Normalized question answerer \\\n", " 0 hey uh mon i saw the porsche park out front ca... rachel \n", " 1 saw the porsche out there mon lookin good when... rachel \n", " 2 nice work everybody so much for the y know you... rachel \n", " 3 whew that be a brisk ride rachel \n", " 4 only way to fly rachel \n", " \n", " answer \n", " 0 wait a minute! you let ross drive the porsche... \n", " 1 you let joey drive it?! \n", " 2 wow! i can’t believe you lied to me. \n", " 3 take the top down did ya? \n", " 4 come on ross give me the keys! monica does not... 
,\n", " 40290)" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "markdown", "source": [ "### Define a function that removes stopwords from the questions" ], "metadata": { "id": "mGJxZ4j97eeG" } }, { "cell_type": "code", "source": [
  "REMOVE_STOPWORDS = False # set to True to drop nltk's English stopwords from the questions\n",
  "stop = stopwords.words('english') if REMOVE_STOPWORDS else [] # words that removeStopWords filters out\n",
  "\n",
  "def removeStopWords(text, stop_words=None):\n",
  "    \"\"\"Return `text` without the words listed in `stop_words`.\n",
  "\n",
  "    text: sentence whose words are separated by whitespace.\n",
  "    stop_words: optional iterable of words to drop; when omitted, the\n",
  "        module-level `stop` list is used (empty unless REMOVE_STOPWORDS is True).\n",
  "    Returns the remaining words joined by single spaces ('' if none remain).\n",
  "    \"\"\"\n",
  "    blocked = set(stop if stop_words is None else stop_words) # set membership instead of scanning a list per word\n",
  "    return \" \".join(word for word in text.split() if word not in blocked)"
 ], "metadata": { "id": "1Z9rfaRlNm-R" }, "execution_count": 9, "outputs": [] }, { "cell_type": "markdown", "source": [ "### In this case I decided to keep the stopwords, because they are very useful in common speech. For example, in the question \"what are you doing?\" every word is a stopword, so removing them leaves an empty question" ], "metadata": { "id": "nF8lEoIc7px8" } }, { "cell_type": "code", "source": [ "question_norm_and_stop = df['Normalized question'].apply(removeStopWords)\n", "df.insert(3, 'Normalized and StopWords question', question_norm_and_stop, True)\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 379 }, "id": "lrGl7c11alS2", "outputId": "b79e7455-1cf0-4b04-91a3-7bfba6a28651" }, "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " questioner question \\\n", "0 ross hey uh mon, i saw the porsche parked out front... \n", "1 joey saw the porsche out there mon, lookin’ good. w... \n", "2 monica nice work everybody! so much for the y’know, y... \n", "3 ross whew! that was a brisk ride! \n", "4 ross only way to fly. 
\n", "\n", " Normalized question \\\n", "0 hey uh mon i saw the porsche park out front ca... \n", "1 saw the porsche out there mon lookin good when... \n", "2 nice work everybody so much for the y know you... \n", "3 whew that be a brisk ride \n", "4 only way to fly \n", "\n", " Normalized and StopWords question answerer \\\n", "0 hey uh mon i saw the porsche park out front ca... rachel \n", "1 saw the porsche out there mon lookin good when... rachel \n", "2 nice work everybody so much for the y know you... rachel \n", "3 whew that be a brisk ride rachel \n", "4 only way to fly rachel \n", "\n", " answer \n", "0 wait a minute! you let ross drive the porsche... \n", "1 you let joey drive it?! \n", "2 wow! i can’t believe you lied to me. \n", "3 take the top down did ya? \n", "4 come on ross give me the keys! monica does not... " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionerquestionNormalized questionNormalized and StopWords questionanswereranswer
0rosshey uh mon, i saw the porsche parked out front...hey uh mon i saw the porsche park out front ca...hey uh mon i saw the porsche park out front ca...rachelwait a minute! you let ross drive the porsche...
1joeysaw the porsche out there mon, lookin’ good. w...saw the porsche out there mon lookin good when...saw the porsche out there mon lookin good when...rachelyou let joey drive it?!
2monicanice work everybody! so much for the y’know, y...nice work everybody so much for the y know you...nice work everybody so much for the y know you...rachelwow! i can’t believe you lied to me.
3rosswhew! that was a brisk ride!whew that be a brisk ridewhew that be a brisk rideracheltake the top down did ya?
4rossonly way to fly.only way to flyonly way to flyrachelcome on ross give me the keys! monica does not...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 8058,\n \"fields\": [\n {\n \"column\": \"questioner\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"susan\",\n \"chandler\",\n \"megan\"\n ],\n \"num_unique_values\": 117,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"question\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"i can tell from your expressions that thats the good news you were hoping for... well, im gonna go continue to... spread the joy.\",\n \"you said two weeks.\",\n \"the game, rachel, the game. you owe us money for the game.\"\n ],\n \"num_unique_values\": 7117,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Normalized question\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"okay here we go\",\n \"oh that d be great\",\n \"rachel ill just call her back\"\n ],\n \"num_unique_values\": 6917,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Normalized and StopWords question\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"okay here we go\",\n \"oh that d be great\",\n \"rachel ill just call her back\"\n ],\n \"num_unique_values\": 6917,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answerer\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"rachel\"\n ],\n \"num_unique_values\": 1,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"yeah. oh! 
was how you invented the cotton gin?!\"\n ],\n \"num_unique_values\": 6969,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "df_filtered = df[df['question'].apply(lambda x: len(x.split()) == 1)]\n", "df_filtered" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 475 }, "id": "iEtVdweB0qdm", "outputId": "151b458d-4aeb-48d8-e1ea-4e3db87bb6c2" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " questioner question Normalized question \\\n", "31 policeman nope. nope \n", "32 policeman nope. nope \n", "33 policeman nope. nope \n", "34 policeman yep. yep \n", "41 ross who? who \n", "... ... ... ... \n", "8007 phoebe where?! where \n", "8040 monica hormones. hormone \n", "8042 phoebe oh! oh \n", "8046 monica what?! what \n", "8055 monica yeah. yeah \n", "\n", " Normalized and StopWords question answerer \\\n", "31 nope rachel \n", "32 nope rachel \n", "33 nope rachel \n", "34 yep rachel \n", "41 who rachel \n", "... ... ... \n", "8007 where rachel \n", "8040 hormone rachel \n", "8042 oh rachel \n", "8046 what rachel \n", "8055 yeah rachel \n", "\n", " answer \n", "31 taurus? \n", "32 virgo? \n", "33 sagittarius? \n", "34 i knew it! i knew it, ahh…. \n", "41 fourth gear!! \n", "... ... \n", "8007 hey! \n", "8040 …hormones, yeah. \n", "8042 it just might be too hard, given the history a... \n", "8046 what?! she made the tea! \n", "8055 and y’know what else, oh my god, are they gonn... \n", "\n", "[999 rows x 6 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionerquestionNormalized questionNormalized and StopWords questionanswereranswer
31policemannope.nopenoperacheltaurus?
32policemannope.nopenoperachelvirgo?
33policemannope.nopenoperachelsagittarius?
34policemanyep.yepyepracheli knew it! i knew it, ahh….
41rosswho?whowhorachelfourth gear!!
.....................
8007phoebewhere?!wherewhererachelhey!
8040monicahormones.hormonehormonerachel…hormones, yeah.
8042phoebeoh!ohohrachelit just might be too hard, given the history a...
8046monicawhat?!whatwhatrachelwhat?! she made the tea!
8055monicayeah.yeahyeahracheland y’know what else, oh my god, are they gonn...
\n", "

999 rows × 6 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df_filtered", "summary": "{\n \"name\": \"df_filtered\",\n \"rows\": 999,\n \"fields\": [\n {\n \"column\": \"questioner\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"gavin\",\n \"dina\",\n \"nurse\"\n ],\n \"num_unique_values\": 68,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"question\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"joey?\",\n \"yep?\",\n \"later.\"\n ],\n \"num_unique_values\": 300,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Normalized question\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"floopy\",\n \"thanks\",\n \"be caaauuuse\"\n ],\n \"num_unique_values\": 203,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Normalized and StopWords question\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"floopy\",\n \"thanks\",\n \"be caaauuuse\"\n ],\n \"num_unique_values\": 203,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answerer\",\n \"properties\": {\n \"dtype\": \"category\",\n \"samples\": [\n \"rachel\"\n ],\n \"num_unique_values\": 1,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"joey! 
why did you tell chandler that monica was getting a boob job?\"\n ],\n \"num_unique_values\": 857,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt" ], "metadata": { "id": "l6u6UTs2wpzF" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "length_sentences = df[\"Normalized question\"].apply(lambda x: len(x.split()))\n", "plt.hist(length_sentences, bins=20)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 576 }, "id": "Alz6ahZYwP23", "outputId": "3a154ea5-3c12-45ce-bd1d-b828c0bd0bc7" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(array([3.554e+03, 2.050e+03, 1.083e+03, 5.920e+02, 3.420e+02, 1.810e+02,\n", " 1.000e+02, 5.700e+01, 4.500e+01, 1.900e+01, 1.000e+01, 9.000e+00,\n", " 2.000e+00, 3.000e+00, 2.000e+00, 5.000e+00, 0.000e+00, 1.000e+00,\n", " 1.000e+00, 2.000e+00]),\n", " array([ 0. , 6.05, 12.1 , 18.15, 24.2 , 30.25, 36.3 , 42.35,\n", " 48.4 , 54.45, 60.5 , 66.55, 72.6 , 78.65, 84.7 , 90.75,\n", " 96.8 , 102.85, 108.9 , 114.95, 121. ]),\n", " )" ] }, "metadata": {}, "execution_count": 13 }, { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAArrElEQVR4nO3df1iVdZ7/8ReC5/jzHEKDAysaZZOSaIaFZyu3RgY0+rXZ7lim7GR56WKb0iiyY2a1E6bblP3Sbdsd2mt0UvfKpuBSQ0zcCn9EMf5KphoabPRAk3GOogLC/f2jL3eexAYMPHzw+biu+7o49+d9Pud9f66S13Wf+74JsyzLEgAAgEF6hLoBAACA9iLAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMExHqBjpLc3OzDh06pP79+yssLCzU7QAAgDawLEtHjx5VXFycevQ4+3mWbhtgDh06pPj4+FC3AQAAzsHBgwc1aNCgs4532wDTv39/Sd8sgMvlCnE3AACgLQKBgOLj4+3f42fTbQNMy9dGLpeLAAMAgGH+2uUfXMQLAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYJyIUDdgoksWFHba3J8vyei0uQEA6C7adQZmxYoVGjlypFwul1wul7xerzZs2GCP33jjjQoLCwvaZs6cGTRHVVWVMjIy1KdPH0VHR2vevHk6depUUM3WrVt19dVXy+l0aujQocrPzz/3IwQAAN1Ou87ADBo0SEuWLNHll18uy7L06quv6vbbb9dHH32kK6+8UpL0wAMP6PHHH7ff06dPH/vnpqYmZWRkyOPx6P3339fhw4c1bdo09ezZU08++aQkqbKyUhkZGZo5c6ZWrVql4uJi3X///YqNjVV6enpHHDMAADBcmGVZ1g+ZICoqSsuWLdP06dN144036qqrrtKzzz7bau2GDRt0yy236NChQ4qJiZEkrVy5Ujk5Ofryyy/lcDiUk5OjwsJC7d27137f5MmTVVtbq40bN7a5r0AgILfbLb/fL5fL9UMO8Qx8hQQAQOdo6+/vc76It6mpSa+99prq6urk9Xrt/atWrdLAgQM1YsQI5ebm6vjx4/ZYaWmpkpKS7PAiSenp6QoEAtq3b59dk5qaGvRZ6enpKi0tPddWAQBAN9Pui3j37Nkjr9erkydPql+/flq/fr0SExMlSffcc4+GDBmiuLg47d69Wzk5OaqoqNDrr78uSfL5fEHhRZL92ufzfW9NIBDQiRMn1Lt371b7qq+vV319vf06EAi099AAAIAh2h1grrjiCpWXl8vv9+t///d/lZmZqZKSEiUmJmrGjBl2XVJSkmJjYzV+/Hh99tlnuuyyyzq08e/Ky8vTY4891qmfAQAAuoZ2f4XkcDg0dOhQJScnKy8vT6NGjdLy5ctbrU1JSZEkffrpp5Ikj8ej6urqoJqW1x6P53trXC7XWc++SFJubq78fr+9HTx4sL2HBgAADPGDH2TX3Nwc9NXN6crLyyVJsbGxkiSv16s9e/aopqbGrikqKpLL5bK/hvJ6vSouLg6ap6ioKOg6m9Y4nU779u6WDQAAdE/t+gopNzdXEydO1ODBg3X06FGtXr1aW7du1aZNm/TZZ59p9erVuvnmmzVgwADt3r1bc+fO1bhx4zRy5EhJUlpamhITEzV16lQtXbpUPp9PCxcuVFZWlpxOpyRp5syZeuGFFzR//nzdd9992rJli9auXavCws678wcAAJilXQGmpqZG06ZN0+HDh+V2uzVy5Eht2rRJP/nJT3Tw4EFt3rxZzz77rOrq6hQfH69JkyZp4cKF9vvDw8NVUFCgWbNmyev1qm/fvsrMzAx6bkxCQoIKCws1d+5cLV++XIMGDdIrr7zCM2AAAIDtBz8HpqviO
TAAAJin058DAwAAECoEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjtCvArFixQiNHjpTL5ZLL5ZLX69WGDRvs8ZMnTyorK0sDBgxQv379NGnSJFVXVwfNUVVVpYyMDPXp00fR0dGaN2+eTp06FVSzdetWXX311XI6nRo6dKjy8/PP/QgBAEC3064AM2jQIC1ZskRlZWX64IMP9OMf/1i333679u3bJ0maO3eu3nrrLa1bt04lJSU6dOiQ7rzzTvv9TU1NysjIUENDg95//329+uqrys/P16JFi+yayspKZWRk6KabblJ5ebnmzJmj+++/X5s2beqgQwYAAKYLsyzL+iETREVFadmyZbrrrrt08cUXa/Xq1brrrrskSQcOHNDw4cNVWlqqsWPHasOGDbrlllt06NAhxcTESJJWrlypnJwcffnll3I4HMrJyVFhYaH27t1rf8bkyZNVW1urjRs3trmvQCAgt9stv98vl8v1Qw7xDJcsKOzQ+U73+ZKMTpsbAICurq2/v8/5Gpimpia99tprqqurk9frVVlZmRobG5WammrXDBs2TIMHD1ZpaakkqbS0VElJSXZ4kaT09HQFAgH7LE5paWnQHC01LXOcTX19vQKBQNAGAAC6p3YHmD179qhfv35yOp2aOXOm1q9fr8TERPl8PjkcDkVGRgbVx8TEyOfzSZJ8Pl9QeGkZbxn7vppAIKATJ06cta+8vDy53W57i4+Pb++hAQAAQ7Q7wFxxxRUqLy/Xjh07NGvWLGVmZmr//v2d0Vu75Obmyu/329vBgwdD3RIAAOgkEe19g8Ph0NChQyVJycnJ2rVrl5YvX66f/vSnamhoUG1tbdBZmOrqank8HkmSx+PRzp07g+ZruUvp9Jrv3rlUXV0tl8ul3r17n7Uvp9Mpp9PZ3sMBAAAG+sHPgWlublZ9fb2Sk5PVs2dPFRcX22MVFRWqqqqS1+uVJHm9Xu3Zs0c1NTV2TVFRkVwulxITE+2a0+doqWmZAwAAoF1nYHJzczVx4kQNHjxYR48e1erVq7V161Zt2rRJbrdb06dPV3Z2tqKiouRyufTggw/K6/Vq7NixkqS0tDQlJiZq6tSpWrp0qXw+nxYuXKisrCz77MnMmTP1wgsvaP78+brvvvu0ZcsWrV27VoWFnXfnDwAAMEu7AkxNTY2mTZumw4cPy+12a+TIkdq0aZN+8pOfSJKeeeYZ9ejRQ5MmTVJ9fb3S09P10ksv2e8PDw9XQUGBZs2aJa/Xq759+yozM1OPP/64XZOQkKDCwkLNnTtXy5cv16BBg/TKK68oPT29gw4ZAACY7gc/B6ar4jkwAACYp9OfAwMAABAqBBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxmlXgMnLy9M111yj/v37Kzo6WnfccYcqKiqCam688UaFhYUFbTNnzgyqqaqqUkZGhvr06aPo6GjNmzdPp06dCqrZunWrrr76ajmdTg0dO
lT5+fnndoQAAKDbaVeAKSkpUVZWlrZv366ioiI1NjYqLS1NdXV1QXUPPPCADh8+bG9Lly61x5qampSRkaGGhga9//77evXVV5Wfn69FixbZNZWVlcrIyNBNN92k8vJyzZkzR/fff782bdr0Aw8XAAB0BxHtKd64cWPQ6/z8fEVHR6usrEzjxo2z9/fp00cej6fVOd5++23t379fmzdvVkxMjK666io98cQTysnJ0eLFi+VwOLRy5UolJCTo6aefliQNHz5c7777rp555hmlp6e39xgBAEA384OugfH7/ZKkqKiooP2rVq3SwIEDNWLECOXm5ur48eP2WGlpqZKSkhQTE2PvS09PVyAQ0L59++ya1NTUoDnT09NVWlr6Q9oFAADdRLvOwJyuublZc+bM0XXXXacRI0bY+++55x4NGTJEcXFx2r17t3JyclRRUaHXX39dkuTz+YLCiyT7tc/n+96aQCCgEydOqHfv3mf0U19fr/r6evt1IBA410MDAABd3DkHmKysLO3du1fvvvtu0P4ZM2bYPyclJSk2Nlbjx4/XZ599pssuu+zcO/0r8vLy9Nhjj3Xa/AAAoOs4p6+QZs+erYKCAr3zzjsaNGjQ99ampKRIkj799FNJksfjUXV1dVBNy+uW62bOVuNyuVo9+yJJubm58vv99nbw4MH2HxgAADBCuwKMZVmaPXu21q9fry1btighIeGvvqe8vFySFBsbK0nyer3as2ePampq7JqioiK5XC4lJibaNcXFxUHzFBUVyev1nvVznE6nXC5X0AYAALqndgWYrKws/eY3v9Hq1avVv39/+Xw++Xw+nThxQpL02Wef6YknnlBZWZk+//xzvfnmm5o2bZrGjRunkSNHSpLS0tKUmJioqVOn6ve//702bdqkhQsXKisrS06nU5I0c+ZM/fGPf9T8+fN14MABvfTSS1q7dq3mzp3bwYcPAABM1K4As2LFCvn9ft14442KjY21tzVr1kiSHA6HNm/erLS0NA0bNkwPP/ywJk2apLfeesueIzw8XAUFBQoPD5fX69W9996radOm6fHHH7drEhISVFhYqKKiIo0aNUpPP/20XnnlFW6hBgAAkqQwy7KsUDfRGQKBgNxut/x+f4d/nXTJgsIOne90ny/J6LS5AQDo6tr6+5u/hQQAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgnIhQN4Bglywo7JR5P1+S0SnzAgAQCu06A5OXl6drrrlG/fv3V3R0tO644w5VVFQE1Zw8eVJZWVkaMGCA+vXrp0mTJqm6ujqopqqqShkZGerTp4+io6M1b948nTp1Kqhm69atuvrqq+V0OjV06FDl5+ef2xECAIBup10BpqSkRFlZWdq+fbuKiorU2NiotLQ01dXV2TVz587VW2+9pXXr1qmkpESHDh3SnXfeaY83NTUpIyNDDQ0Nev/99/Xqq68qPz9fixYtsmsqKyuVkZGhm266SeXl5ZozZ47uv/9+bdq0qQMOGQAAmC7MsizrXN/85ZdfKjo6WiUlJRo3bpz8fr8uvvhirV69WnfddZck6cCBAxo+fLhKS0s1duxYbdiwQbfccosOHTqkmJgYSdLKlSuVk5OjL7/8Ug6HQzk5OSosLNTevXvtz5o8ebJqa2u1cePGNvUWCATkdrvl9/vlcrnO9RBb1Vlf83QmvkICAJigrb+/f9BFvH6/X5IUFRUlSSorK1NjY6NSU1PtmmHDhmnw4MEqL
S2VJJWWliopKckOL5KUnp6uQCCgffv22TWnz9FS0zJHa+rr6xUIBII2AADQPZ1zgGlubtacOXN03XXXacSIEZIkn88nh8OhyMjIoNqYmBj5fD675vTw0jLeMvZ9NYFAQCdOnGi1n7y8PLndbnuLj48/10MDAABd3DkHmKysLO3du1evvfZaR/ZzznJzc+X3++3t4MGDoW4JAAB0knO6jXr27NkqKCjQtm3bNGjQIHu/x+NRQ0ODamtrg87CVFdXy+Px2DU7d+4Mmq/lLqXTa75751J1dbVcLpd69+7dak9Op1NOp/NcDgcAABimXWdgLMvS7NmztX79em3ZskUJCQlB48nJyerZs6eKi4vtfRUVFaqqqpLX65Ukeb1e7dmzRzU1NXZNUVGRXC6XEhMT7ZrT52ipaZkDAABc2Np1BiYrK0urV6/W7373O/Xv39++ZsXtdqt3795yu92aPn26srOzFRUVJZfLpQcffFBer1djx46VJKWlpSkxMVFTp07V0qVL5fP5tHDhQmVlZdlnUGbOnKkXXnhB8+fP13333actW7Zo7dq1Kiw07+4fAADQ8dp1BmbFihXy+/268cYbFRsba29r1qyxa5555hndcsstmjRpksaNGyePx6PXX3/dHg8PD1dBQYHCw8Pl9Xp17733atq0aXr88cftmoSEBBUWFqqoqEijRo3S008/rVdeeUXp6ekdcMgAAMB0P+g5MF0Zz4EJxnNgAAAmOC/PgQEAAAgFAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcdodYLZt26Zbb71VcXFxCgsL0xtvvBE0/k//9E8KCwsL2iZMmBBUc+TIEU2ZMkUul0uRkZGaPn26jh07FlSze/du3XDDDerVq5fi4+O1dOnS9h8dAADoltodYOrq6jRq1Ci9+OKLZ62ZMGGCDh8+bG+//e1vg8anTJmiffv2qaioSAUFBdq2bZtmzJhhjwcCAaWlpWnIkCEqKyvTsmXLtHjxYr388svtbRcAAHRDEe19w8SJEzVx4sTvrXE6nfJ4PK2Offzxx9q4caN27dqlMWPGSJKef/553Xzzzfr3f/93xcXFadWqVWpoaNB///d/y+Fw6Morr1R5ebl+9atfBQUdAABwYeqUa2C2bt2q6OhoXXHFFZo1a5a++uore6y0tFSRkZF2eJGk1NRU9ejRQzt27LBrxo0bJ4fDYdekp6eroqJCX3/9daufWV9fr0AgELQBAIDuqcMDzIQJE/Q///M/Ki4u1lNPPaWSkhJNnDhRTU1NkiSfz6fo6Oig90RERCgqKko+n8+uiYmJCapped1S8115eXlyu932Fh8f39GHBgAAuoh2f4X010yePNn+OSkpSSNHjtRll12mrVu3avz48R39cbbc3FxlZ2fbrwOBACEGAIBuqtNvo7700ks1cOBAffrpp5Ikj8ejmpqaoJpTp07pyJEj9nUzHo9H1dXVQTUtr892bY3T6ZTL5QraAABA99TpAeaLL77QV199pdjYWEmS1+tVbW2tysrK7JotW7aoublZKSkpds22bdvU2Nho1xQVFemKK67QRRdd1NktAwCALq7dAebYsWMqLy9XeXm5JKmyslLl5eWqqqrSsWPHNG/ePG3fvl2ff/65iouLdfvtt2vo0KFKT0+XJA0fPlwTJkzQAw88oJ07d+q9997T7NmzNXnyZMXFxUmS7rnnHjkcDk2fPl379u3TmjVrtHz58qCviAAAw
IWr3QHmgw8+0OjRozV69GhJUnZ2tkaPHq1FixYpPDxcu3fv1m233aYf/ehHmj59upKTk/V///d/cjqd9hyrVq3SsGHDNH78eN188826/vrrg57x4na79fbbb6uyslLJycl6+OGHtWjRIm6hBgAAkqQwy7KsUDfRGQKBgNxut/x+f4dfD3PJgsIOne98+HxJRqhbAADgr2rr72/+FhIAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGCci1A3g/LhkQWGnzf35koxOmxsAgNZwBgYAABiHAAMAAIxDgAEAAMZpd4DZtm2bbr31VsXFxSksLExvvPFG0LhlWVq0aJFiY2PVu3dvpaam6pNPPgmqOXLkiKZMmSKXy6XIyEhNnz5dx44dC6rZvXu3brjhBvXq1Uvx8fFaunRp+48OAAB0S+0OMHV1dRo1apRefPHFVseXLl2q5557TitXrtSOHTvUt29fpaen6+TJk3bNlClTtG/fPhUVFamgoEDbtm3TjBkz7PFAIKC0tDQNGTJEZWVlWrZsmRYvXqyXX375HA4RAAB0N2GWZVnn/OawMK1fv1533HGHpG/OvsTFxenhhx/Wz3/+c0mS3+9XTEyM8vPzNXnyZH388cdKTEzUrl27NGbMGEnSxo0bdfPNN+uLL75QXFycVqxYoV/84hfy+XxyOBySpAULFuiNN97QgQMH2tRbIBCQ2+2W3++Xy+U610NsVWfe0WMi7kICAHSUtv7+7tBrYCorK+Xz+ZSammrvc7vdSklJUWlpqSSptLRUkZGRdniRpNTUVPXo0UM7duywa8aNG2eHF0lKT09XRUWFvv7661Y/u76+XoFAIGgDAADdU4cGGJ/PJ0mKiYkJ2h8TE2OP+Xw+RUdHB41HREQoKioqqKa1OU7/jO/Ky8uT2+22t/j4+B9+QAAAoEvqNnch5ebmyu/329vBgwdD3RIAAOgkHRpgPB6PJKm6ujpof3V1tT3m8XhUU1MTNH7q1CkdOXIkqKa1OU7/jO9yOp1yuVxBGwAA6J46NMAkJCTI4/GouLjY3hcIBLRjxw55vV5JktfrVW1trcrKyuyaLVu2qLm5WSkpKXbNtm3b1NjYaNcUFRXpiiuu0EUXXdSRLQMAAAO1O8AcO3ZM5eXlKi8vl/TNhbvl5eWqqqpSWFiY5syZo3/7t3/Tm2++qT179mjatGmKi4uz71QaPny4JkyYoAceeEA7d+7Ue++9p9mzZ2vy5MmKi4uTJN1zzz1yOByaPn269u3bpzVr1mj58uXKzs7usAMHAADmavcfc/zggw9000032a9bQkVmZqby8/M1f/581dXVacaMGaqtrdX111+vjRs3qlevXvZ7Vq1apdmzZ2v8+PHq0aOHJk2apOeee84ed7vdevvtt5WVlaXk5GQNHDhQixYtCnpWDAAAuHD9oOfAdGU8B+b84TkwAICOEpLnwAAAAJwPBBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcQgwAADAOAQYAABgHAIMAAAwDgEGAAAYhwADAACMQ4ABAADGIcAAAADjEGAAAIBxCDAAAMA4BBgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYJyIUDcA812yoLDT5v58SUanzQ0AMBdnYAAAgHEIMAAAwDgEGAAAYBwCD
AAAME6HB5jFixcrLCwsaBs2bJg9fvLkSWVlZWnAgAHq16+fJk2apOrq6qA5qqqqlJGRoT59+ig6Olrz5s3TqVOnOrpVAABgqE65C+nKK6/U5s2bv/2QiG8/Zu7cuSosLNS6devkdrs1e/Zs3XnnnXrvvfckSU1NTcrIyJDH49H777+vw4cPa9q0aerZs6eefPLJzmgXAAAYplMCTEREhDwezxn7/X6//uu//kurV6/Wj3/8Y0nSr3/9aw0fPlzbt2/X2LFj9fbbb2v//v3avHmzYmJidNVVV+mJJ55QTk6OFi9eLIfD0RktAwAAg3TKNTCffPKJ4uLidOmll2rKlCmqqqqSJJWVlamxsVGpqal27bBhwzR48GCVlpZKkkpLS5WUlKSYmBi7Jj09XYFAQPv27TvrZ9bX1ysQCARtAACge+rwAJOSkqL8/Hxt3LhRK1asUGVlpW644QYdPXpUPp9PDodDkZGRQe+JiYmRz+eTJPl8vqDw0jLeMnY2eXl5crvd9hYfH9+xBwYAALqMDv8KaeLEifbPI0eOVEpKioYMGaK1a9eqd+/eHf1xttzcXGVnZ9uvA4EAIQYAgG6q02+jjoyM1I9+9CN9+umn8ng8amhoUG1tbVBNdXW1fc2Mx+M5466kltetXVfTwul0yuVyBW0AAKB76vQAc+zYMX322WeKjY1VcnKyevbsqeLiYnu8oqJCVVVV8nq9kiSv16s9e/aopqbGrikqKpLL5VJiYmJntwsAAAzQ4V8h/fznP9ett96qIUOG6NChQ3r00UcVHh6uu+++W263W9OnT1d2draioqLkcrn04IMPyuv1auzYsZKktLQ0JSYmaurUqVq6dKl8Pp8WLlyorKwsOZ3Ojm4XAAAYqMMDzBdffKG7775bX331lS6++GJdf/312r59uy6++GJJ0jPPPKMePXpo0qRJqq+vV3p6ul566SX7/eHh4SooKNCsWbPk9XrVt29fZWZm6vHHH+/oVgEAgKHCLMuyQt1EZwgEAnK73fL7/R1+PcwlCwo7dD6c3edLMkLdAgDgPGrr72/+FhIAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYJwOfxIv0JE666GBPCAPAMzGGRgAAGAcAgwAADAOAQYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcSJC3QAQCpcsKOy0uT9fktFpcwMAvsEZGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4xBgAACAcXgODNDBOusZMzxfBgC+xRkYAABgHAIMAAAwDgEGAAAYhwADAACMw0W8gCH4A5QA8K0uHWBefPFFLVu2TD6fT6NGjdLzzz+va6+9NtRtAd0Od04BME2X/QppzZo1ys7O1qOPPqoPP/xQo0aNUnp6umpqakLdGgAACLEwy7KsUDfRmpSUFF1zzTV64YUXJEnNzc2Kj4/Xgw8+qAULFvzV9wcCAbndbvn9frlcrg7trTNP5QNoG87uAN1TW39/d8mvkBoaGlRWVqbc3Fx7X48ePZSamqrS0tJW31NfX6/6+nr7td/vl/TNQnS05vrjHT4ngPYZPHddqFtot72PpYe6BaDLa/m9/dfOr3TJAPOXv/xFTU1NiomJCdofExOjAwcOtPqevLw8PfbYY2fsj4+P75QeAaC93M+GugPAHEePHpXb7T7reJcMMOciNzdX2dnZ9uvm5mYdOXJEAwYMUFhYWId9TiAQUHx8vA4ePNjhX011N6xV27BObcdatR1r1TasU9udr7WyLEtHjx5VXFzc99Z1yQAzcOBAhYeHq7q6Omh/dXW1PB5Pq+9xOp1yOp1B+yIjIzurRblcLv5jbyPWqm1Yp7ZjrdqOtWob1qntzsdafd+ZlxZd8i4kh8Oh5ORkFRcX2/uam5tVXFwsr
9cbws4AAEBX0CXPwEhSdna2MjMzNWbMGF177bV69tlnVVdXp5/97Gehbg0AAIRYlw0wP/3pT/Xll19q0aJF8vl8uuqqq7Rx48YzLuw935xOpx599NEzvq7CmVirtmGd2o61ajvWqm1Yp7bramvVZZ8DAwAAcDZd8hoYAACA70OAAQAAxiHAAAAA4xBgAACAcQgw7fTiiy/qkksuUa9evZSSkqKdO3eGuqWQysvL0zXXXKP+/fsrOjpad9xxhyoqKoJqTp48qaysLA0YMED9+vXTpEmTznhI4YVmyZIlCgsL05w5c+x9rNO3/vznP+vee+/VgAED1Lt3byUlJemDDz6wxy3L0qJFixQbG6vevXsrNTVVn3zySQg7Do2mpiY98sgjSkhIUO/evXXZZZfpiSeeCPobMhfiWm3btk233nqr4uLiFBYWpjfeeCNovC1rcuTIEU2ZMkUul0uRkZGaPn26jh07dh6P4vz4vrVqbGxUTk6OkpKS1LdvX8XFxWnatGk6dOhQ0ByhWisCTDusWbNG2dnZevTRR/Xhhx9q1KhRSk9PV01NTahbC5mSkhJlZWVp+/btKioqUmNjo9LS0lRXV2fXzJ07V2+99ZbWrVunkpISHTp0SHfeeWcIuw6tXbt26T/+4z80cuTIoP2s0ze+/vprXXfdderZs6c2bNig/fv36+mnn9ZFF11k1yxdulTPPfecVq5cqR07dqhv375KT0/XyZMnQ9j5+ffUU09pxYoVeuGFF/Txxx/rqaee0tKlS/X888/bNRfiWtXV1WnUqFF68cUXWx1vy5pMmTJF+/btU1FRkQoKCrRt2zbNmDHjfB3CefN9a3X8+HF9+OGHeuSRR/Thhx/q9ddfV0VFhW677bagupCtlYU2u/baa62srCz7dVNTkxUXF2fl5eWFsKuupaamxpJklZSUWJZlWbW1tVbPnj2tdevW2TUff/yxJckqLS0NVZshc/ToUevyyy+3ioqKrL/7u7+zHnroIcuyWKfT5eTkWNdff/1Zx5ubmy2Px2MtW7bM3ldbW2s5nU7rt7/97floscvIyMiw7rvvvqB9d955pzVlyhTLslgry7IsSdb69evt121Zk/3791uSrF27dtk1GzZssMLCwqw///nP56338+27a9WanTt3WpKsP/3pT5ZlhXatOAPTRg0NDSorK1Nqaqq9r0ePHkpNTVVpaWkIO+ta/H6/JCkqKkqSVFZWpsbGxqB1GzZsmAYPHnxBrltWVpYyMjKC1kNinU735ptvasyYMfqHf/gHRUdHa/To0frP//xPe7yyslI+ny9ordxut1JSUi64tfrbv/1bFRcX6w9/+IMk6fe//73effddTZw4URJr1Zq2rElpaakiIyM1ZswYuyY1NVU9evTQjh07znvPXYnf71dYWJj9twZDuVZd9km8Xc1f/vIXNTU1nfEk4JiYGB04cCBEXXUtzc3NmjNnjq677jqNGDFCkuTz+eRwOM74w5oxMTHy+Xwh6DJ0XnvtNX344YfatWvXGWOs07f++Mc/asWKFcrOzta//uu/ateuXfqXf/kXORwOZWZm2uvR2v+LF9paLViwQIFAQMOGDVN4eLiampr0y1/+UlOmTJEk1qoVbVkTn8+n6OjooPGIiAhFRUVdsOsmfXOdXk5Oju6++277jzmGcq0IMOgwWVlZ2rt3r959991Qt9LlHDx4UA899JCKiorUq1evULfTpTU3N2vMmDF68sknJUmjR4/W3r17tXLlSmVmZoa4u65l7dq1WrVqlVavXq0rr7xS5eXlmjNnjuLi4lgrdKjGxkb94z/+oyzL0ooVK0LdjiQu4m2zgQMHKjw8/Iy7Qqqrq+XxeELUVdcxe/ZsFRQU6J133tGgQYPs/R6PRw0NDaqtrQ2qv9DWraysTDU1Nbr66qsVERGhiIgIlZSU6LnnnlNERIRiYmJYp/8vNjZWiYmJQfuGDx+uqqoqSbLXg/8XpXnz5mnBggWaPHmykpKSN
HXqVM2dO1d5eXmSWKvWtGVNPB7PGTdnnDp1SkeOHLkg160lvPzpT39SUVGRffZFCu1aEWDayOFwKDk5WcXFxfa+5uZmFRcXy+v1hrCz0LIsS7Nnz9b69eu1ZcsWJSQkBI0nJyerZ8+eQetWUVGhqqqqC2rdxo8frz179qi8vNzexowZoylTptg/s07fuO666864Ff8Pf/iDhgwZIklKSEiQx+MJWqtAIKAdO3ZccGt1/Phx9egR/M94eHi4mpubJbFWrWnLmni9XtXW1qqsrMyu2bJli5qbm5WSknLeew6llvDyySefaPPmzRowYEDQeEjXqlMvEe5mXnvtNcvpdFr5+fnW/v37rRkzZliRkZGWz+cLdWshM2vWLMvtdltbt261Dh8+bG/Hjx+3a2bOnGkNHjzY2rJli/XBBx9YXq/X8nq9Iey6azj9LiTLYp1a7Ny504qIiLB++ctfWp988om1atUqq0+fPtZvfvMbu2bJkiVWZGSk9bvf/c7avXu3dfvtt1sJCQnWiRMnQtj5+ZeZmWn9zd/8jVVQUGBVVlZar7/+ujVw4EBr/vz5ds2FuFZHjx61PvroI+ujjz6yJFm/+tWvrI8++si+c6YtazJhwgRr9OjR1o4dO6x3333Xuvzyy6277747VIfUab5vrRoaGqzbbrvNGjRokFVeXh70b3x9fb09R6jWigDTTs8//7w1ePBgy+FwWNdee621ffv2ULcUUpJa3X7961/bNSdOnLD++Z//2brooousPn36WH//939vHT58OHRNdxHfDTCs07feeusta8SIEZbT6bSGDRtmvfzyy0Hjzc3N1iOPPGLFxMRYTqfTGj9+vFVRURGibkMnEAhYDz30kDV48GCrV69e1qWXXmr94he/CPrlciGu1TvvvNPqv0uZmZmWZbVtTb766ivr7rvvtvr162e5XC7rZz/7mXX06NEQHE3n+r61qqysPOu/8e+88449R6jWKsyyTntkIwAAgAG4BgYAABiHAAMAAIxDgAEAAMYhwAAAAOMQYAAAgHEIMAAAwDgEGAAAYBwCDAAAMA4BBgAAGIcAAwAAjEOAAQAAxiHAAAAA4/w/b+aajA7AWHQAAAAASUVORK5CYII=\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "## Initialize TF-IDF" ], "metadata": { "id": "pykJXbUB7-t4" } }, { "cell_type": "code", "source": [ "tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=5024) # initializing tf-idf\n", "x_tfidf = tfidf.fit_transform(df['Normalized and StopWords question']).toarray() # oversimplifying this converts words to vectors\n", "\n", "features_tfidf = tfidf.get_feature_names_out() # use function to get all the normalized words\n", "df_tfidf = pd.DataFrame(x_tfidf, columns = features_tfidf) # create dataframe to show the 0, 1 value for each word\n", "# df_tfidf.loc[:,['hello', 'rick', 'morty']].head() # show only specific columns\n", "df_tfidf.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 290 }, "id": "Zofq8bftN0Jh", "outputId": 
"2a6f55e6-6bd3-42e3-8fe4-91319e51d3ae" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " able able to about about be about him about how about it about me \\\n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " about my about ross ... youre have youre not youre not gon \\\n", "0 0.0 0.0 ... 0.0 0.0 0.0 \n", "1 0.0 0.0 ... 0.0 0.0 0.0 \n", "2 0.0 0.0 ... 0.0 0.0 0.0 \n", "3 0.0 0.0 ... 0.0 0.0 0.0 \n", "4 0.0 0.0 ... 0.0 0.0 0.0 \n", "\n", " youre right youre such youre the yours yourself youve youve get \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[5 rows x 5024 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ableable toaboutabout beabout himabout howabout itabout meabout myabout ross...youre haveyoure notyoure not gonyoure rightyoure suchyoure theyoursyourselfyouveyouve get
00.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

5 rows × 5024 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
def chat_tfidf(question):
    """Return the stored answer whose question is most similar to ``question``.

    The query goes through the same preprocessing order as the indexed
    'Normalized and StopWords question' column, which was built by applying
    ``removeStopWords`` to already-normalized text: normalize first, then
    filter stop words. (Filtering the raw text first would let tokens such
    as "What?!" slip past the stop list whenever stop-word removal is on.)
    NOTE(review): assumes the 'Normalized question' column was produced by
    ``text_normalization`` -- confirm against the cell that creates it.

    Parameters
    ----------
    question : str
        Raw user message.

    Returns
    -------
    str
        The ``answer`` of the best-matching row of ``df``. If the question
        shares no vocabulary with the fitted TF-IDF model, every similarity
        ties and the first row wins.
    """
    tidy_question = removeStopWords(text_normalization(question))
    tf = tfidf.transform([tidy_question]).toarray()  # shape (1, n_features)
    cos = 1 - pairwise_distances(df_tfidf, tf, metric='cosine')
    # argmax yields a row *position* within df_tfidf, so index positionally
    # (.iloc); label-based .loc is only correct while df keeps a 0..n-1 index.
    index_value = cos.argmax()
    return df['answer'].iloc[index_value]
def chat_tfidf_context(question, history, memory_weights=(0.1, 0.3)):
    """Answer ``question`` while taking the most recent dialogue turns into account.

    The TF-IDF vectors of the latest user messages in ``history`` are
    down-weighted and averaged together with the vector of the current
    question (weight 1.0). The answer of the row of ``df`` closest to that
    blended vector (cosine similarity against ``df_tfidf``) is returned.

    Parameters
    ----------
    question : str
        The current user message.
    history : sequence of (user_message, bot_answer) pairs
        Previous turns, oldest first. Only element ``[0]`` of each pair is
        used. May be empty.
    memory_weights : sequence of float, optional
        Weights for the remembered turns, oldest to newest. Its length is
        the size of the bot's memory. With a shorter history the trailing
        (most recent) weights are used, so a single remembered turn gets 0.3.

    Returns
    -------
    str
        The ``answer`` of the best-matching row.
    """
    def _vectorize(text):
        # Same preprocessing order as the indexed column: normalize, then
        # drop stop words, then project with the fitted vectorizer.
        tidy = removeStopWords(text_normalization(text))
        return tfidf.transform([tidy]).toarray()[0]

    n_context = len(memory_weights)
    # Guard the zero case explicitly: history[-0:] would return everything.
    recent = list(history[-n_context:]) if n_context else []
    weights = memory_weights[n_context - len(recent):]

    # One row per turn actually used, so the buffer never contains padding
    # rows and its width follows the fitted vocabulary instead of a constant.
    rows = [weight * _vectorize(turn[0]) for weight, turn in zip(weights, recent)]
    rows.append(_vectorize(question))
    context_vector = np.mean(rows, axis=0).reshape(1, -1)

    cos = 1 - pairwise_distances(df_tfidf, context_vector, metric='cosine')
    # argmax is a row position in df_tfidf -> positional lookup.
    index_value = cos.argmax()
    return df['answer'].iloc[index_value]
give-give me a brush.\", \"gimme the keys!\"]])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "CxbzNXHP0z-G", "outputId": "7cb79973-86af-41dc-9756-c10cb157b41c" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'well no brush!'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "markdown", "source": [ "### you can use these question and answers to check the chatbot" ], "metadata": { "id": "EpTq8u8d6hzU" } }, { "cell_type": "code", "source": [ "print(df['question'].iloc[[14,75,23,94]])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FGL9i5PR5x2Z", "outputId": "389279cc-07b2-4d5d-8aa4-77d6a76979d9" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "14 with you?! yeah right!\n", "75 uh, hey!\n", "23 no rach! come on! no-no! yeah, i’m sure we won...\n", "94 you have no idea what a nightmare this has bee...\n", "Name: question, dtype: object\n" ] } ] }, { "cell_type": "code", "source": [ "print(df['answer'].iloc[[14,75,23,94]])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Hf0ehhMH6BIF", "outputId": "ccd1910b-7ecd-41ba-b779-1af4ee3be154" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "14 all right.\n", "75 whats going on?\n", "23 hi officer, was i going a little too fast?\n", "94 oh yeah, really? is it ross? yeah? 
def tokenize(sent: str) -> str:
    """Lower-case a sentence and return its kept tokens joined by single spaces.

    The text is split with ``nltk.word_tokenize``; any token found in the
    module-level ``stop`` or ``punkt`` collections is discarded.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(sent.lower()):
        if token in stop or token in punkt:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
that baby out again',\n", " 'nice work everybody so much for the y ’ know you can drive it but don ’ t tell rachel plan',\n", " 'whew that was a brisk ride',\n", " 'only way to fly',\n", " 'you ’ re fast and irresponsible that adds up to a bad driver',\n", " 'did you see the look that girl just gave me huh she must ’ ve seen me cruising in the bad boy',\n", " 'what give-give me a brush',\n", " 'no way',\n", " 'fine y ’ know what it doesn ’ t matter because if i remember correctly there is a comb on the floor of the bathroom']" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "markdown", "source": [ "Обучим Word2vec на нашем корпусе текстов + добавим отдельный вектор для UNK, чтобы избежать вечной проблемы с поиском слов, которые отсутствуют в тренировочной выборке." ], "metadata": { "id": "UxzMEfLUEDKL" } }, { "cell_type": "code", "source": [ "questions_w2v = [sent.split(\" \") for sent in questions_preprocessed]" ], "metadata": { "id": "1WFj3vUHD-lk" }, "execution_count": 25, "outputs": [] }, { "cell_type": "code", "source": [ "%%time\n", "from gensim.models import Word2Vec, KeyedVectors\n", "import gensim.downloader as api" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "m-GqvlHTFAWv", "outputId": "33730077-88bf-42e0-846b-db5d0fdd8d09" }, "execution_count": 26, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CPU times: user 95.9 ms, sys: 19.6 ms, total: 115 ms\n", "Wall time: 347 ms\n" ] } ] }, { "cell_type": "code", "source": [ "# w2v = Word2Vec(sentences=questions_w2v, min_count=2, vector_size=50, window=6, seed=33, workers=4)\n", "w2v = api.load(\"glove-twitter-25\")" ], "metadata": { "id": "JgwSH3Y5SZeY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e323eb1b-eab8-447e-ad41-dc9b07a6a3a8" }, "execution_count": 27, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[==================================================] 100.0% 104.8/104.8MB downloaded\n" ] } ] }, { 
"cell_type": "code", "source": [ "w2v.save('w2v.bin')" ], "metadata": { "id": "ZIZHXBYQSEmE" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "w2v = KeyedVectors.load('w2v.bin')" ], "metadata": { "id": "Qm2L7wycSIO2" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "unknown_vector = np.random.uniform(low=-0.2, high=0.2, size=(25,))" ], "metadata": { "id": "xy-RUh2mhtV5" }, "execution_count": 30, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Define function which calculates vectorized sentences and for unknown words insert special vector" ], "metadata": { "id": "0Ps8syfWT4ka" } }, { "cell_type": "code", "source": [ "def w2v_get_vector_for_sentence(sentence):\n", " sent = nltk.word_tokenize(sentence.lower())\n", " sent = [word for word in sent if word not in punkt]\n", " sentence_vector = []\n", " if len(sent)==0:\n", " sentence_vector.append(unknown_vector)\n", " else:\n", " for word in sent:\n", " if word in w2v.key_to_index:\n", " sentence_vector.append(w2v[word])\n", " else:\n", " sentence_vector.append(unknown_vector)\n", "\n", " return np.array(sentence_vector).mean(axis=0)" ], "metadata": { "id": "EtLYYAWIg59k" }, "execution_count": 31, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Calculate the base with vectorized sentences for dataframe" ], "metadata": { "id": "FZ3pPzTvTxxD" } }, { "cell_type": "code", "source": [ "base = np.zeros(shape=(len(df.question), 25))\n", "for ind, sentence in enumerate(df['question']): # df[df['question'].str.len() >= 1]\n", " base[ind] = w2v_get_vector_for_sentence(sentence)" ], "metadata": { "id": "xXTq1Se9Y4Mp" }, "execution_count": 32, "outputs": [] }, { "cell_type": "code", "source": [ "def chat_word2vec(question):\n", " question = [w2v_get_vector_for_sentence(question)]\n", " cos = 1- pairwise_distances(base, question, metric = 'cosine') # calculate the cosine value\n", " index_value = cos.argmax() # find the index of the maximum cosine 
value\n", " answer = df['answer'].loc[index_value]\n", " return answer" ], "metadata": { "id": "djvPfcMTk2-0" }, "execution_count": 33, "outputs": [] }, { "cell_type": "code", "source": [ "def chat_word2vec_context(question, history):\n", "\n", " len_history = len(history)\n", "\n", " if len_history > 1:\n", " memory_weights = np.array([0.1, 0.3, 1.0]) # .reshape((3,1))\n", " # take last two sentences in accordance to bot's memory\n", " history = history[-2:]\n", "\n", " else:\n", " memory_weights = np.array([0.3, 1.0])\n", "\n", " history_sentence = np.zeros(shape=(len_history+1, 25))\n", "\n", " for ind, h in enumerate(history):\n", " sentence = w2v_get_vector_for_sentence(h[0])\n", " history_sentence[ind] = sentence * memory_weights[ind]\n", "\n", " question = w2v_get_vector_for_sentence(question)\n", "\n", " history_sentence[-1] = question\n", " history_sentence = history_sentence.mean(axis=0).reshape(1, -1)\n", "\n", " cos = 1- pairwise_distances(base, history_sentence, metric = 'cosine')\n", " index_value = cos.argmax()\n", " answer = df['answer'].loc[index_value]\n", "\n", " return answer" ], "metadata": { "id": "7YYz1vVWDMvt" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "df.loc[2, [\"question\", \"answer\"]]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "c2JKLac8NPZy", "outputId": "152a2f28-9ac2-488e-f742-e0e9ee0bee83" }, "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "question nice work everybody! so much for the y’know, y...\n", "answer wow! 
i can’t believe you lied to me.\n", "Name: 2, dtype: object" ] }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [ "chat_word2vec_context(question=\"hey uh mon, i saw the porsche parked out front...\", history=[])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 54 }, "id": "gp1AQThMM4f7", "outputId": "dc953286-bfe5-46d2-d256-d997af957391" }, "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'wait a minute! you let ross drive the porsche and when i ask you, you say you’re the only one who’s allowed to drive it.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "chat_word2vec_context(question=\"saw the porsche out there mon, lookin’ good.\", history=[[\"hey uh mon, i saw the porsche parked out front...\", \"wait a minute! you let ross drive the porsche and when i ask you, you say you’re the only one who’s allowed to drive it.\"]])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 54 }, "id": "sez_ozg1END0", "outputId": "d2528e31-a8aa-484e-9664-f3842814e1d0" }, "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'wait a minute! you let ross drive the porsche and when i ask you, you say you’re the only one who’s allowed to drive it.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "source": [], "metadata": { "id": "b0i3YbNmPGLI" } }, { "cell_type": "markdown", "source": [ "Теперь перейдем к контекстуализированным эмбеддингам ваших документов/текстов с использованием предобученных Энкодерных текстовых моделей. 
Для решения более 90% задач этот метод отлично подходит как одно из первых решений и будет если не первым, то уж точно вторым baseline-решением.\n", "\n", "Напомним, что BERT-like модели хороши тем, что помимо того, что они возвращают контекстуализированный эмбеддинг для каждого токена, у них еще есть представление всего текста, которое можно доставать из CLS токена (почти всегда он представлен на первом месте)\n", "\n", "Для данной задачи используем *elastic/multilingual-e5-small-optimized*" ], "metadata": { "id": "3qxYSUt_dd4J" } }, { "cell_type": "markdown", "source": [ "### Load libraries, set device and download model with tokenizer" ], "metadata": { "id": "rlmn1sioYCl-" } }, { "cell_type": "code", "source": [ "import torch\n", "from transformers import AutoTokenizer, AutoModel" ], "metadata": { "id": "AX9F17KodjyM" }, "execution_count": 38, "outputs": [] }, { "cell_type": "code", "source": [ "# model_name = \"elastic/multilingual-e5-small-optimized\"\n", "model_name = \"distilbert/distilbert-base-uncased\"\n", "device = \"cpu\"" ], "metadata": { "id": "aXHzrAh3o5gC" }, "execution_count": 39, "outputs": [] }, { "cell_type": "code", "source": [ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModel.from_pretrained(model_name)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 306, "referenced_widgets": [ "59acb6a5958b4c198cb46454b7a6dec9", "5ac625801dea4629aecece1ff517264a", "55ca96d7fc30455bababc6bad1b7e1a5", "46deb03028054b14a5b15823fef6ee87", "0edf7dcbc72d4e58a38ae37f8254042f", "4772e836196a4a7b956964f9167b56ee", "56107b13ea3b4762bfdae3c4223423a4", "cc4656efe2154ad3a45df827db7f1545", "7667c315a9e349b1a68a569b4d139419", "56cf608a62a54e3bb6fa58a9b957d9b0", "c0c0aedf7a43453fab24711721427223", "41fe5c4b1b734bc4a41cbd1f2db00989", "fdd3f19839c040e5a4a466460b5a2480", "50d15161d5344de9b9cbeba28862f957", "f05643a308d649d8b990813b50cbc4aa", "62032165e7ab4bd393e096fc6221fcb7", "1bdeb4fed33243ea83bf98428f369db3", 
"f75043abf72e4739ba3bb86090215fc5", "fc1b15b082594ffa94ecf60f532c8a5c", "788452ff4de244dab85b8b8adc42b4ea", "59093618fafe46eea7a1e38bb20a9300", "3f6fb8acd7e74442a894330c6b4767a8", "0994304224d24789b23432c2f7de953a", "177edf091f7749dfb5ef3f84a6d92c75", "2429a72651864f7fa6c9a57eeccebea8", "cd7ce4a3424a475b9b4a7ea47a71e914", "f7ab5c61c83e4b029a9109e30acc13b4", "5adeff780b084deea141f38bf1549301", "d975f27f07904fdbbeb663b959c0e580", "bd059140d3f24b3381a96f2f4adca8c4", "88a2bcfb25e1420eb736839677e2753a", "3d28bb240d504e2cb20329a47583a1db", "d61521350b154468be12b2b2ce20b971", "fc4b486eed0a46d0b3dd6ee32a13ba48", "91bedc8101a44adbb9deb5dae6908c7a", "c2c80ee83cb14f7ba55b437bef36746f", "d4691144447c4c6e949e956d9cc2acbf", "1a90905ebad24062ae6aceef6596d923", "f94bc9cc61fa43e2ab811fae617eae53", "4dd54f9cb56f4765bae7ebbcbb8ab4e5", "5454a07844794801afaea942de3cda98", "cf071df905c5444286f15a9bf2b95828", "cd7813cb1eb845b8b85ce40c8658299a", "cb475a7759d94e3e9fb891136af83e2b", "b45024a0ec8747a2bd2677b3d8e10352", "483daaf25506490da97f687523439aaa", "98b5dd2edffe4921a0a080c98bdd4792", "1a41c82bf7504e8085bfd16d3bbf36a2", "ab95197ced5449b496f1221079b36908", "470ddaf005054b25b05125b2fa7fde4a", "65722a6fde5e4087834e7a741a315cae", "516dc0635a214f06997574c4f902de97", "dc72eb488b4547a38ac317f37261216a", "47cc743855a7459a90c127d3ab60b3a3", "065bf7a1f04945c38aca86a0f972aca9" ] }, "id": "9OcfJ3wWou_p", "outputId": "aa49b5c5-733e-4b6f-e249-287778957965" }, "execution_count": 40, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is 
class BERTSearchEngine:
    """Nearest-neighbour text search over first-token encoder embeddings.

    Every database text is tokenised, passed through ``model`` and represented
    by the L2-normalised hidden state of its first token. Queries are embedded
    the same way and ranked by cosine similarity against those vectors.

    Attributes set by the constructor:
        raw_procesed_data: tokenizer output for each database text (the
            misspelled name is kept for backward compatibility).
        base: array with one embedding row per database text.
        retriever / tokenizer: the encoder and tokenizer passed in.
        inverted_index: ``{position: text}`` for the database.
    """

    def __init__(self, model, tokenizer, text_database):
        # Materialise once: the database is walked more than once below, and
        # a one-shot iterable (e.g. a generator) would otherwise be exhausted
        # after the first pass.
        texts = list(text_database)
        self.raw_procesed_data = []
        self.base = []
        self.retriever = None
        self.inverted_index = {}
        self._init_retriever(model, tokenizer, texts)
        self._init_inverted_index(texts)

    @staticmethod
    def preprocess(sentence: str, tokenizer):
        """Tokenise ``sentence`` into a batch of PyTorch tensors."""
        return tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    def _embed_bert_cls(self, tokenized_text: dict[str, torch.Tensor]) -> np.ndarray:
        """Return the normalised first-token embedding of the first sequence in the batch."""
        with torch.no_grad():
            model_output = self.retriever(**{k: v.to(self.retriever.device) for k, v in tokenized_text.items()})
        embeddings = model_output.last_hidden_state[:, 0, :]  # first token of every sequence
        embeddings = torch.nn.functional.normalize(embeddings)
        return embeddings[0].cpu().numpy()

    def _init_retriever(self, model, tokenizer, text_database):
        """Store the encoder/tokenizer and embed every database text."""
        self.retriever = model
        self.tokenizer = tokenizer
        # Tokenise each text exactly once and reuse the result for embedding
        # (previously the whole database was tokenised twice).
        self.raw_procesed_data = [self.preprocess(text, tokenizer) for text in text_database]
        self.base = np.array([self._embed_bert_cls(tokens) for tokens in tqdm(self.raw_procesed_data)])

    def _init_inverted_index(self, text_database: list[str]):
        """Map each database position to its original text."""
        self.inverted_index = dict(enumerate(text_database))

    def _similarities(self, query: str) -> np.ndarray:
        """Cosine similarity between ``query`` and every database embedding, as a 1-D array."""
        query_vector = self.retrieve(query)
        return cosine_similarity([query_vector], self.base).flatten()

    def retrieve(self, query: str) -> np.ndarray:
        """Embed ``query`` with the same pipeline used for the database."""
        return self._embed_bert_cls(self.preprocess(query, self.tokenizer))

    def retrieve_documents(self, query: str, top_k=3) -> list[int]:
        """Return the positions of the ``top_k`` most similar texts, best first."""
        relevant_indices = np.argsort(self._similarities(query), axis=0)[::-1][:top_k]
        return relevant_indices.tolist()

    def display_relevant_docs(self, query, full_database=None, top_k=3) -> list[str]:
        """Return the ``top_k`` most similar database texts, best first.

        ``full_database`` is not used (texts come from ``inverted_index``); the
        parameter is kept, now optional, so existing calls keep working.
        """
        docs_indexes = self.retrieve_documents(query, top_k=top_k)
        return [self.inverted_index[ind] for ind in docs_indexes]

    def find_answer(self, query: str) -> int:
        """Return the position of the single most similar database text as a plain ``int``."""
        return int(np.argmax(self._similarities(query), axis=0))
Default to no truncation.\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " 0%| | 0/8058 [00:00 1:\n", " memory_weights = np.array([0.1, 0.3, 1.0]) # .reshape((3,1))\n", " # take last two sentences in accordance to bot's memory\n", " history = history[-2:]\n", "\n", " else:\n", " memory_weights = np.array([0.3, 1.0])\n", "\n", " history_sentence = np.zeros(shape=(len_history+1, 384))\n", "\n", " for ind, h in enumerate(history):\n", "\n", " sentence = simple_search_engine.retrieve(h)\n", " history_sentence[ind] = sentence * memory_weights[ind]\n", "\n", " question = simple_search_engine.retrieve(question)\n", "\n", " history_sentence[-1] = question\n", " history_sentence = history_sentence.mean(axis=0).reshape(1, -1)\n", "\n", " cosine_similarities = cosine_similarity(history_sentence, simple_search_engine.base).flatten()\n", " relevant_indice = np.argmax(cosine_similarities, axis=0)\n", " answer = df['answer'].loc[relevant_indice]\n", "\n", " return answer" ], "metadata": { "id": "Xr4f7h1jkYVd" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "chat_bert(question=\"rachel! what are you doing here?\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "outputId": "e1f9bb96-2ba3-4bff-fa7e-f270eb494190", "id": "Dh1ltgOOoO8D" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'i’m just visiting my good friend carol.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "code", "source": [ "chat_word2vec_context(question=\"saw the porsche out there mon, lookin’ good.\", history=[[\"hey uh mon, i saw the porsche parked out front...\", \"wait a minute! 
you let ross drive the porsche and when i ask you, you say you’re the only one who’s allowed to drive it.\"]])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "outputId": "488f3039-e162-42fd-9cce-0e874758f9d4", "id": "aPmEe5C7oO8D" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'wait a minute! you let ross drive the porsche and when i ask you, you say you’re the only one who’s allowed to drive it.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 254 } ] }, { "cell_type": "code", "source": [ "chat_bert(\"rachel! what are you doing here?\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "Kn_F-cXlWQfg", "outputId": "70367a5c-8056-4627-9a45-f37f4caaed45" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'i’m just visiting my good friend carol.'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 255 } ] }, { "cell_type": "markdown", "source": [ "## Install gradio for chatbot" ], "metadata": { "id": "kmY56gM3YvkE" } }, { "cell_type": "code", "source": [ "!pip install -q gradio" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "kPW1c-Xdp3e1", "outputId": "02b93ca6-6c71-4f39-f21e-ef6d48e07cf6" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.9/16.9 MB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.7/307.7 kB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.0/139.0 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m47.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.8/60.8 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.5/71.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
def echo(message, history, model):
    """Chat callback: answer ``message`` with the retrieval model chosen in the UI.

    Args:
        message: The user's new message.
        history: Previous exchanges, forwarded unchanged to the chat function.
        model: Name selected in the "Retrieval model" dropdown.

    Returns:
        The selected model's answer, or an explanatory message when the
        selected name has no implementation (previously this fell through and
        returned ``None``).
    """
    # Lambdas keep the lookup lazy: a chat function whose defining cell has
    # not been run only fails if that particular model is selected.
    responders = {
        "TF-IDF": lambda: chat_tfidf_context(message, history),
        "W2V": lambda: chat_word2vec_context(message, history),
        "BERT": lambda: chat_bert_context(message, history),
    }

    responder = responders.get(model)
    if responder is None:
        available = ", ".join(responders)
        return f"The '{model}' model is not available yet. Please choose one of: {available}."
    return responder()
This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", "Running on public URL: https://55ffad7ddcb9b39da4.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
" ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "[]\n", "[['hey', 'hi']]\n", "[['hey', 'hi'], ['How are you?', 'ross!']]\n", "[['hey', 'hi'], ['How are you?', 'ross!'], ['What?', 'no, you! phoebe you freaked me out. you kept saying how huge this all is!']]\n", "[['hey', 'hi'], ['How are you?', 'ross!'], ['What?', 'no, you! phoebe you freaked me out. you kept saying how huge this all is!'], ['Do you want to play something with me?', 'well yeah, i wish that you would. well, no it’s not in there! how about that drawer?']]\n", "[['hey', 'hi'], ['How are you?', 'ross!'], ['What?', 'no, you! phoebe you freaked me out. you kept saying how huge this all is!'], ['Do you want to play something with me?', 'well yeah, i wish that you would. well, no it’s not in there! how about that drawer?'], ['What do you think about children?', 'no paul, i don’t know anything about you! y’know, like-like your childhood! tell me about your childhood!']]\n", "Keyboard interruption in main thread... closing server.\n", "Killing tunnel 127.0.0.1:7860 <> https://55ffad7ddcb9b39da4.gradio.live\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [] }, "metadata": {}, "execution_count": 260 } ] }, { "cell_type": "code", "source": [ "df[[\"question\", \"answer\"]].head(10)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "gQ7-CLIkAeLQ", "outputId": "b420f5de-afa8-4ad6-f6b7-2ac9de511c80" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " question \\\n", "0 hey uh mon, i saw the porsche parked out front... \n", "1 saw the porsche out there mon, lookin’ good. w... \n", "2 nice work everybody! so much for the y’know, y... \n", "3 whew! that was a brisk ride! \n", "4 only way to fly. \n", "5 you’re fast and irresponsible. that adds up to... \n", "6 did you see the look that girl just gave me? h... \n", "7 what?! give-give me a brush. \n", "8 no way! \n", "9 fine! 
y’know what? it doesn’t matter, because,... \n", "\n", " answer \n", "0 wait a minute! you let ross drive the porsche... \n", "1 you let joey drive it?! \n", "2 wow! i can’t believe you lied to me. \n", "3 take the top down did ya? \n", "4 come on ross give me the keys! monica does not... \n", "5 well in high school, that added up to head che... \n", "6 i think she’s checking out your beehive ross. \n", "7 gimme the keys! \n", "8 well no brush! \n", "9 alimony. " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionanswer
0hey uh mon, i saw the porsche parked out front...wait a minute! you let ross drive the porsche...
1saw the porsche out there mon, lookin’ good. w...you let joey drive it?!
2nice work everybody! so much for the y’know, y...wow! i can’t believe you lied to me.
3whew! that was a brisk ride!take the top down did ya?
4only way to fly.come on ross give me the keys! monica does not...
5you’re fast and irresponsible. that adds up to...well in high school, that added up to head che...
6did you see the look that girl just gave me? h...i think she’s checking out your beehive ross.
7what?! give-give me a brush.gimme the keys!
8no way!well no brush!
9fine! y’know what? it doesn’t matter, because,...alimony.
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df[[\\\"question\\\", \\\"answer\\\"]]\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"question\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"no way!\",\n \"saw the porsche out there mon, lookin\\u2019 good. when do i get to take that baby out again?\",\n \"you\\u2019re fast and irresponsible. that adds up to a bad driver.\"\n ],\n \"num_unique_values\": 10,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"answer\",\n \"properties\": {\n \"dtype\": \"string\",\n \"samples\": [\n \"well no brush!\",\n \"you let joey drive it?!\",\n \"well in high school, that added up to head cheerleader.\"\n ],\n \"num_unique_values\": 10,\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 69 } ] }, { "cell_type": "code", "source": [ "import pandas as pd\n", "\n", "# Example DataFrame\n", "data = {\n", " \"name_1\": [\"Alice\", \"Bob\", \"Charlie\"],\n", " \"sentence2\": [\"This is sentence 1\", \"This is sentence 2\", \"This is sentence 3\"],\n", " \"name_2\": [\"Dave\", \"Eve\", \"Frank\"],\n", " \"sentence_2\": [\"Another sentence 1\", \"Another sentence 2\", \"Another sentence 3\"],\n", " \"label\": [0, 1, 0]\n", "}\n", "df = pd.DataFrame(data)\n", "\n", "# Create a list of dictionaries\n", "list_of_dicts = []\n", "for index, row in df.iterrows():\n", " # Create a dictionary for the current row\n", " row_dict = {\n", " \"name_1\": row[\"name_1\"],\n", " \"sentence2\": row[\"sentence2\"],\n", " \"name_2\": row[\"name_2\"],\n", " \"sentence_2\": row[\"sentence_2\"],\n", " \"label\": row[\"label\"]\n", " }\n", " # Append the dictionary to the list\n", " list_of_dicts.append(row_dict)\n", "\n", "# Print the list of dictionaries\n", "print(list_of_dicts)" ], "metadata": { "id": "mRJzBW_cCr4n" }, "execution_count": null, "outputs": [] }, { "cell_type": 
"code", "source": [], "metadata": { "id": "_0L80aNvp4vM" }, "execution_count": null, "outputs": [] } ] }