{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100\n", "200\n", "300\n", "400\n", "500\n", "600\n", "700\n", "800\n", "900\n", "1000\n", "1100\n", "1200\n", "1300\n", "1400\n", "1500\n", "1600\n", "1700\n", "1800\n", "full_model_names\n", "1889\n", "organization_names\n", "12\n", "['Parameters', 'drop|3', 'gsm8k', 'MMLU_average', 'winogrande', 'all', 'arc:challenge|25', 'hellaswag|10', 'MMLU_abstract_algebra', 'MMLU_anatomy', 'MMLU_astronomy', 'MMLU_business_ethics', 'MMLU_clinical_knowledge', 'MMLU_college_biology', 'MMLU_college_chemistry', 'MMLU_college_computer_science', 'MMLU_college_mathematics', 'MMLU_college_medicine', 'MMLU_college_physics', 'MMLU_computer_security', 'MMLU_conceptual_physics', 'MMLU_econometrics', 'MMLU_electrical_engineering', 'MMLU_elementary_mathematics', 'MMLU_formal_logic', 'MMLU_global_facts', 'MMLU_high_school_biology', 'MMLU_high_school_chemistry', 'MMLU_high_school_computer_science', 'MMLU_high_school_european_history', 'MMLU_high_school_geography', 'MMLU_high_school_government_and_politics', 'MMLU_high_school_macroeconomics', 'MMLU_high_school_mathematics', 'MMLU_high_school_microeconomics', 'MMLU_high_school_physics', 'MMLU_high_school_psychology', 'MMLU_high_school_statistics', 'MMLU_high_school_us_history', 'MMLU_high_school_world_history', 'MMLU_human_aging', 'MMLU_human_sexuality', 'MMLU_international_law', 'MMLU_jurisprudence', 'MMLU_logical_fallacies', 'MMLU_machine_learning', 'MMLU_management', 'MMLU_marketing', 'MMLU_medical_genetics', 'MMLU_miscellaneous', 'MMLU_moral_disputes', 'MMLU_moral_scenarios', 'MMLU_nutrition', 'MMLU_philosophy', 'MMLU_prehistory', 'MMLU_professional_accounting', 'MMLU_professional_law', 'MMLU_professional_medicine', 'MMLU_professional_psychology', 'MMLU_public_relations', 'MMLU_security_studies', 'MMLU_sociology', 'MMLU_us_foreign_policy', 'MMLU_virology', 'MMLU_world_religions', 'truthfulqa:mc|0', 'full_model_name']\n" ] } ], "source": [ "from result_data_processor import ResultDataProcessor\n", "result = ResultDataProcessor()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>URL</th>\n", " <th>full_model_name</th>\n", " <th>Parameters</th>\n", " <th>MMLU_average</th>\n", " <th>arc:challenge|25</th>\n", " <th>hellaswag|10</th>\n", " <th>MMLU_abstract_algebra</th>\n", " <th>MMLU_anatomy</th>\n", " <th>MMLU_astronomy</th>\n", " <th>MMLU_business_ethics</th>\n", " <th>...</th>\n", " <th>MMLU_professional_accounting</th>\n", " <th>MMLU_professional_law</th>\n", " <th>MMLU_professional_medicine</th>\n", " <th>MMLU_professional_psychology</th>\n", " <th>MMLU_public_relations</th>\n", " <th>MMLU_security_studies</th>\n", " <th>MMLU_sociology</th>\n", " <th>MMLU_us_foreign_policy</th>\n", " <th>MMLU_virology</th>\n", " <th>MMLU_world_religions</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>SparseOPT-1.3B</th>\n", " <td>https://huggingface.co/shaohang/SparseOPT-1.3B</td>\n", " <td>shaohang/SparseOPT-1.3B</td>\n", " <td>1.3</td>\n", " <td>0.255963</td>\n", " <td>0.240614</td>\n", " <td>0.383689</td>\n", " <td>0.22</td>\n", " <td>0.214815</td>\n", " <td>0.157895</td>\n", " <td>0.20</td>\n", " <td>...</td>\n", " <td>0.262411</td>\n", " <td>0.238592</td>\n", " <td>0.448529</td>\n", " <td>0.254902</td>\n", " <td>0.236364</td>\n", " <td>0.171429</td>\n", " <td>0.228856</td>\n", " <td>0.27</td>\n", " <td>0.283133</td>\n", " <td>0.216374</td>\n", " </tr>\n", " <tr>\n", " <th>Athena-v1</th>\n", " <td>https://huggingface.co/IkariDev/Athena-v1</td>\n", " <td>IkariDev/Athena-v1</td>\n", " <td>NaN</td>\n", " <td>0.556052</td>\n", " <td>0.560580</td>\n", " <td>0.631548</td>\n", " <td>0.31</td>\n", " <td>0.496296</td>\n", " <td>0.526316</td>\n", " <td>0.58</td>\n", " <td>...</td>\n", " <td>0.404255</td>\n", " <td>0.392438</td>\n", " <td>0.525735</td>\n", " <td>0.540850</td>\n", " <td>0.645455</td>\n", " <td>0.640816</td>\n", " <td>0.751244</td>\n", " <td>0.83</td>\n", " <td>0.493976</td>\n", " <td>0.725146</td>\n", " </tr>\n", " <tr>\n", " <th>Athena-tmp</th>\n", " <td>https://huggingface.co/IkariDev/Athena-tmp</td>\n", " <td>IkariDev/Athena-tmp</td>\n", " <td>NaN</td>\n", " <td>0.588685</td>\n", " <td>0.567406</td>\n", " <td>0.621888</td>\n", " <td>0.29</td>\n", " <td>0.518519</td>\n", " <td>0.638158</td>\n", " <td>0.62</td>\n", " <td>...</td>\n", " <td>0.450355</td>\n", " <td>0.462842</td>\n", " <td>0.569853</td>\n", " <td>0.588235</td>\n", " <td>0.645455</td>\n", " <td>0.653061</td>\n", " <td>0.721393</td>\n", " <td>0.81</td>\n", " <td>0.463855</td>\n", " <td>0.801170</td>\n", " </tr>\n", " <tr>\n", " <th>13B-Legerdemain-L2</th>\n", " <td>https://huggingface.co/CalderaAI/13B-Legerdema...</td>\n", " <td>CalderaAI/13B-Legerdemain-L2</td>\n", " <td>13.0</td>\n", " <td>0.560030</td>\n", " <td>0.573379</td>\n", " <td>0.635431</td>\n", " <td>0.36</td>\n", " <td>0.525926</td>\n", " <td>0.572368</td>\n", " <td>0.53</td>\n", " <td>...</td>\n", " <td>0.429078</td>\n", " <td>0.424381</td>\n", " <td>0.522059</td>\n", " <td>0.532680</td>\n", " <td>0.609091</td>\n", " <td>0.636735</td>\n", " <td>0.766169</td>\n", " <td>0.87</td>\n", " <td>0.427711</td>\n", " <td>0.777778</td>\n", " </tr>\n", " <tr>\n", " <th>13B-Ouroboros</th>\n", " <td>https://huggingface.co/CalderaAI/13B-Ouroboros</td>\n", " <td>CalderaAI/13B-Ouroboros</td>\n", " <td>13.0</td>\n", " <td>0.514311</td>\n", " <td>0.560580</td>\n", " <td>0.624378</td>\n", " <td>0.31</td>\n", " <td>0.466667</td>\n", " <td>0.506579</td>\n", " <td>0.52</td>\n", " <td>...</td>\n", " <td>0.365248</td>\n", " <td>0.405476</td>\n", " <td>0.481618</td>\n", " <td>0.524510</td>\n", " <td>0.609091</td>\n", " <td>0.538776</td>\n", " <td>0.691542</td>\n", " <td>0.83</td>\n", " <td>0.457831</td>\n", " <td>0.760234</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>Robin-v2</th>\n", " <td>https://huggingface.co/HanningZhang/Robin-v2</td>\n", " <td>HanningZhang/Robin-v2</td>\n", " <td>NaN</td>\n", " <td>0.392680</td>\n", " <td>0.435154</td>\n", " <td>0.545310</td>\n", " <td>0.32</td>\n", " <td>0.437037</td>\n", " <td>0.335526</td>\n", " <td>0.46</td>\n", " <td>...</td>\n", " <td>0.290780</td>\n", " <td>0.302477</td>\n", " <td>0.382353</td>\n", " <td>0.374183</td>\n", " <td>0.445455</td>\n", " <td>0.326531</td>\n", " <td>0.457711</td>\n", " <td>0.59</td>\n", " <td>0.379518</td>\n", " <td>0.590643</td>\n", " </tr>\n", " <tr>\n", " <th>CodeUp-Llama-2-13b-chat-hf</th>\n", " <td>https://huggingface.co/deepse/CodeUp-Llama-2-1...</td>\n", " <td>deepse/CodeUp-Llama-2-13b-chat-hf</td>\n", " <td>13.0</td>\n", " <td>0.546262</td>\n", " <td>0.558020</td>\n", " <td>0.629257</td>\n", " <td>0.31</td>\n", " <td>0.474074</td>\n", " <td>0.546053</td>\n", " <td>0.53</td>\n", " <td>...</td>\n", " <td>0.390071</td>\n", " <td>0.391786</td>\n", " <td>0.500000</td>\n", " <td>0.544118</td>\n", " <td>0.663636</td>\n", " <td>0.636735</td>\n", " <td>0.751244</td>\n", " <td>0.81</td>\n", " <td>0.481928</td>\n", " <td>0.730994</td>\n", " </tr>\n", " <tr>\n", " <th>Hermes-Platypus2-mini-7B</th>\n", " <td>https://huggingface.co/edor/Hermes-Platypus2-m...</td>\n", " <td>edor/Hermes-Platypus2-mini-7B</td>\n", " <td>7.0</td>\n", " <td>0.470828</td>\n", " <td>0.523038</td>\n", " <td>0.601573</td>\n", " <td>0.33</td>\n", " <td>0.488889</td>\n", " <td>0.421053</td>\n", " <td>0.48</td>\n", " <td>...</td>\n", " <td>0.390071</td>\n", " <td>0.353977</td>\n", " <td>0.470588</td>\n", " <td>0.446078</td>\n", " <td>0.518182</td>\n", " <td>0.563265</td>\n", " <td>0.621891</td>\n", " <td>0.68</td>\n", " <td>0.421687</td>\n", " <td>0.637427</td>\n", " </tr>\n", " <tr>\n", " <th>Stable-Platypus2-mini-7B</th>\n", " <td>https://huggingface.co/edor/Stable-Platypus2-m...</td>\n", " <td>edor/Stable-Platypus2-mini-7B</td>\n", " <td>7.0</td>\n", " <td>0.517800</td>\n", " <td>0.523891</td>\n", " <td>0.596594</td>\n", " <td>0.37</td>\n", " <td>0.488889</td>\n", " <td>0.407895</td>\n", " <td>0.50</td>\n", " <td>...</td>\n", " <td>0.390071</td>\n", " <td>0.391786</td>\n", " <td>0.518382</td>\n", " <td>0.509804</td>\n", " <td>0.618182</td>\n", " <td>0.657143</td>\n", " <td>0.631841</td>\n", " <td>0.73</td>\n", " <td>0.427711</td>\n", " <td>0.695906</td>\n", " </tr>\n", " <tr>\n", " <th>llava-v1.5-13b-hf</th>\n", " <td>https://huggingface.co/Community-LM/llava-v1.5...</td>\n", " <td>Community-LM/llava-v1.5-13b-hf</td>\n", " <td>13.0</td>\n", " <td>0.568868</td>\n", " <td>0.532423</td>\n", " <td>0.601175</td>\n", " <td>0.30</td>\n", " <td>0.496296</td>\n", " <td>0.585526</td>\n", " <td>0.67</td>\n", " <td>...</td>\n", " <td>0.407801</td>\n", " <td>0.415906</td>\n", " <td>0.547794</td>\n", " <td>0.578431</td>\n", " <td>0.600000</td>\n", " <td>0.653061</td>\n", " <td>0.761194</td>\n", " <td>0.81</td>\n", " <td>0.506024</td>\n", " <td>0.795322</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>1121 rows × 63 columns</p>\n", "</div>" ], "text/plain": [ " URL \\\n", "SparseOPT-1.3B https://huggingface.co/shaohang/SparseOPT-1.3B \n", "Athena-v1 https://huggingface.co/IkariDev/Athena-v1 \n", "Athena-tmp https://huggingface.co/IkariDev/Athena-tmp \n", "13B-Legerdemain-L2 https://huggingface.co/CalderaAI/13B-Legerdema... \n", "13B-Ouroboros https://huggingface.co/CalderaAI/13B-Ouroboros \n", "... ... \n", "Robin-v2 https://huggingface.co/HanningZhang/Robin-v2 \n", "CodeUp-Llama-2-13b-chat-hf https://huggingface.co/deepse/CodeUp-Llama-2-1... \n", "Hermes-Platypus2-mini-7B https://huggingface.co/edor/Hermes-Platypus2-m... \n", "Stable-Platypus2-mini-7B https://huggingface.co/edor/Stable-Platypus2-m... \n", "llava-v1.5-13b-hf https://huggingface.co/Community-LM/llava-v1.5... \n", "\n", " full_model_name Parameters \\\n", "SparseOPT-1.3B shaohang/SparseOPT-1.3B 1.3 \n", "Athena-v1 IkariDev/Athena-v1 NaN \n", "Athena-tmp IkariDev/Athena-tmp NaN \n", "13B-Legerdemain-L2 CalderaAI/13B-Legerdemain-L2 13.0 \n", "13B-Ouroboros CalderaAI/13B-Ouroboros 13.0 \n", "... ... ... \n", "Robin-v2 HanningZhang/Robin-v2 NaN \n", "CodeUp-Llama-2-13b-chat-hf deepse/CodeUp-Llama-2-13b-chat-hf 13.0 \n", "Hermes-Platypus2-mini-7B edor/Hermes-Platypus2-mini-7B 7.0 \n", "Stable-Platypus2-mini-7B edor/Stable-Platypus2-mini-7B 7.0 \n", "llava-v1.5-13b-hf Community-LM/llava-v1.5-13b-hf 13.0 \n", "\n", " MMLU_average arc:challenge|25 hellaswag|10 \\\n", "SparseOPT-1.3B 0.255963 0.240614 0.383689 \n", "Athena-v1 0.556052 0.560580 0.631548 \n", "Athena-tmp 0.588685 0.567406 0.621888 \n", "13B-Legerdemain-L2 0.560030 0.573379 0.635431 \n", "13B-Ouroboros 0.514311 0.560580 0.624378 \n", "... ... ... ... \n", "Robin-v2 0.392680 0.435154 0.545310 \n", "CodeUp-Llama-2-13b-chat-hf 0.546262 0.558020 0.629257 \n", "Hermes-Platypus2-mini-7B 0.470828 0.523038 0.601573 \n", "Stable-Platypus2-mini-7B 0.517800 0.523891 0.596594 \n", "llava-v1.5-13b-hf 0.568868 0.532423 0.601175 \n", "\n", " MMLU_abstract_algebra MMLU_anatomy \\\n", "SparseOPT-1.3B 0.22 0.214815 \n", "Athena-v1 0.31 0.496296 \n", "Athena-tmp 0.29 0.518519 \n", "13B-Legerdemain-L2 0.36 0.525926 \n", "13B-Ouroboros 0.31 0.466667 \n", "... ... ... \n", "Robin-v2 0.32 0.437037 \n", "CodeUp-Llama-2-13b-chat-hf 0.31 0.474074 \n", "Hermes-Platypus2-mini-7B 0.33 0.488889 \n", "Stable-Platypus2-mini-7B 0.37 0.488889 \n", "llava-v1.5-13b-hf 0.30 0.496296 \n", "\n", " MMLU_astronomy MMLU_business_ethics ... \\\n", "SparseOPT-1.3B 0.157895 0.20 ... \n", "Athena-v1 0.526316 0.58 ... \n", "Athena-tmp 0.638158 0.62 ... \n", "13B-Legerdemain-L2 0.572368 0.53 ... \n", "13B-Ouroboros 0.506579 0.52 ... \n", "... ... ... ... \n", "Robin-v2 0.335526 0.46 ... \n", "CodeUp-Llama-2-13b-chat-hf 0.546053 0.53 ... \n", "Hermes-Platypus2-mini-7B 0.421053 0.48 ... \n", "Stable-Platypus2-mini-7B 0.407895 0.50 ... \n", "llava-v1.5-13b-hf 0.585526 0.67 ... \n", "\n", " MMLU_professional_accounting \\\n", "SparseOPT-1.3B 0.262411 \n", "Athena-v1 0.404255 \n", "Athena-tmp 0.450355 \n", "13B-Legerdemain-L2 0.429078 \n", "13B-Ouroboros 0.365248 \n", "... ... \n", "Robin-v2 0.290780 \n", "CodeUp-Llama-2-13b-chat-hf 0.390071 \n", "Hermes-Platypus2-mini-7B 0.390071 \n", "Stable-Platypus2-mini-7B 0.390071 \n", "llava-v1.5-13b-hf 0.407801 \n", "\n", " MMLU_professional_law MMLU_professional_medicine \\\n", "SparseOPT-1.3B 0.238592 0.448529 \n", "Athena-v1 0.392438 0.525735 \n", "Athena-tmp 0.462842 0.569853 \n", "13B-Legerdemain-L2 0.424381 0.522059 \n", "13B-Ouroboros 0.405476 0.481618 \n", "... ... ... \n", "Robin-v2 0.302477 0.382353 \n", "CodeUp-Llama-2-13b-chat-hf 0.391786 0.500000 \n", "Hermes-Platypus2-mini-7B 0.353977 0.470588 \n", "Stable-Platypus2-mini-7B 0.391786 0.518382 \n", "llava-v1.5-13b-hf 0.415906 0.547794 \n", "\n", " MMLU_professional_psychology \\\n", "SparseOPT-1.3B 0.254902 \n", "Athena-v1 0.540850 \n", "Athena-tmp 0.588235 \n", "13B-Legerdemain-L2 0.532680 \n", "13B-Ouroboros 0.524510 \n", "... ... \n", "Robin-v2 0.374183 \n", "CodeUp-Llama-2-13b-chat-hf 0.544118 \n", "Hermes-Platypus2-mini-7B 0.446078 \n", "Stable-Platypus2-mini-7B 0.509804 \n", "llava-v1.5-13b-hf 0.578431 \n", "\n", " MMLU_public_relations MMLU_security_studies \\\n", "SparseOPT-1.3B 0.236364 0.171429 \n", "Athena-v1 0.645455 0.640816 \n", "Athena-tmp 0.645455 0.653061 \n", "13B-Legerdemain-L2 0.609091 0.636735 \n", "13B-Ouroboros 0.609091 0.538776 \n", "... ... ... \n", "Robin-v2 0.445455 0.326531 \n", "CodeUp-Llama-2-13b-chat-hf 0.663636 0.636735 \n", "Hermes-Platypus2-mini-7B 0.518182 0.563265 \n", "Stable-Platypus2-mini-7B 0.618182 0.657143 \n", "llava-v1.5-13b-hf 0.600000 0.653061 \n", "\n", " MMLU_sociology MMLU_us_foreign_policy \\\n", "SparseOPT-1.3B 0.228856 0.27 \n", "Athena-v1 0.751244 0.83 \n", "Athena-tmp 0.721393 0.81 \n", "13B-Legerdemain-L2 0.766169 0.87 \n", "13B-Ouroboros 0.691542 0.83 \n", "... ... ... \n", "Robin-v2 0.457711 0.59 \n", "CodeUp-Llama-2-13b-chat-hf 0.751244 0.81 \n", "Hermes-Platypus2-mini-7B 0.621891 0.68 \n", "Stable-Platypus2-mini-7B 0.631841 0.73 \n", "llava-v1.5-13b-hf 0.761194 0.81 \n", "\n", " MMLU_virology MMLU_world_religions \n", "SparseOPT-1.3B 0.283133 0.216374 \n", "Athena-v1 0.493976 0.725146 \n", "Athena-tmp 0.463855 0.801170 \n", "13B-Legerdemain-L2 0.427711 0.777778 \n", "13B-Ouroboros 0.457831 0.760234 \n", "... ... ... \n", "Robin-v2 0.379518 0.590643 \n", "CodeUp-Llama-2-13b-chat-hf 0.481928 0.730994 \n", "Hermes-Platypus2-mini-7B 0.421687 0.637427 \n", "Stable-Platypus2-mini-7B 0.427711 0.695906 \n", "llava-v1.5-13b-hf 0.506024 0.795322 \n", "\n", "[1121 rows x 63 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = result.data\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "mmlu", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }