vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.904|±|0.0187|
| | |strict-match|5|exact_match|↑|0.896|±|0.0193|

vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.910|±|0.0128|
| | |strict-match|5|exact_match|↑|0.894|±|0.0138|

vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7801|±|0.0132|
| - humanities|2|none| |acc|↑|0.8359|±|0.0253|
| - other|2|none| |acc|↑|0.7897|±|0.0272|
| - social sciences|2|none| |acc|↑|0.8167|±|0.0280|
| - stem|2|none| |acc|↑|0.7123|±|0.0248|
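The headers above are lm-evaluation-harness run summaries; a command along the following lines (standard `lm_eval` CLI flags, with the model path and vLLM kwargs taken verbatim from the first header) is the kind of invocation that produces them:

```shell
# Sketch of the lm-eval invocation implied by the first gsm8k header above.
# Requires lm-evaluation-harness with the vLLM backend and the local model weights.
lm_eval --model vllm \
  --model_args pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b,add_bos_token=true,max_model_len=3096,dtype=bfloat16 \
  --tasks gsm8k \
  --num_fewshot 5 \
  --limit 250 \
  --batch_size auto
```

The MMLU runs swap in `--tasks mmlu --limit 15 --batch_size 1` and drop `--num_fewshot`.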

vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b-W8A8-INT8,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.756|±|0.0272|
| | |strict-match|5|exact_match|↑|0.788|±|0.0259|

vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b-W8A8-INT8,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.772|±|0.0188|
| | |strict-match|5|exact_match|↑|0.790|±|0.0182|

vllm (pretrained=/root/autodl-tmp/shisa-v2-mistral-small-24b-W8A8-INT8,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7450|±|0.0137|
| - humanities|2|none| |acc|↑|0.8000|±|0.0257|
| - other|2|none| |acc|↑|0.7487|±|0.0283|
| - social sciences|2|none| |acc|↑|0.8000|±|0.0286|
| - stem|2|none| |acc|↑|0.6702|±|0.0259|

vllm (pretrained=/root/autodl-tmp/70-10-128,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.888|±|0.0200|
| | |strict-match|5|exact_match|↑|0.864|±|0.0217|

vllm (pretrained=/root/autodl-tmp/70-10-128,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.894|±|0.0138|
| | |strict-match|5|exact_match|↑|0.866|±|0.0152|

vllm (pretrained=/root/autodl-tmp/70-10-128,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7743|±|0.0135|
| - humanities|2|none| |acc|↑|0.8000|±|0.0255|
| - other|2|none| |acc|↑|0.7897|±|0.0275|
| - social sciences|2|none| |acc|↑|0.8056|±|0.0289|
| - stem|2|none| |acc|↑|0.7263|±|0.0252|

vllm (pretrained=/root/autodl-tmp/70-10-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.888|±|0.0200|
| | |strict-match|5|exact_match|↑|0.852|±|0.0225|

vllm (pretrained=/root/autodl-tmp/70-10-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.900|±|0.0134|
| | |strict-match|5|exact_match|↑|0.862|±|0.0154|

vllm (pretrained=/root/autodl-tmp/70-10-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7731|±|0.0134|
| - humanities|2|none| |acc|↑|0.8103|±|0.0255|
| - other|2|none| |acc|↑|0.7744|±|0.0280|
| - social sciences|2|none| |acc|↑|0.8056|±|0.0285|
| - stem|2|none| |acc|↑|0.7263|±|0.0250|

vllm (pretrained=/root/autodl-tmp/70-10-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.884|±|0.0203|
| | |strict-match|5|exact_match|↑|0.864|±|0.0217|

vllm (pretrained=/root/autodl-tmp/70-10-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.892|±|0.0139|
| | |strict-match|5|exact_match|↑|0.860|±|0.0155|

vllm (pretrained=/root/autodl-tmp/70-10-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7801|±|0.0133|
| - humanities|2|none| |acc|↑|0.8154|±|0.0258|
| - other|2|none| |acc|↑|0.7846|±|0.0282|
| - social sciences|2|none| |acc|↑|0.8222|±|0.0275|
| - stem|2|none| |acc|↑|0.7263|±|0.0244|

vllm (pretrained=/root/autodl-tmp/70-10-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.892|±|0.0197|
| | |strict-match|5|exact_match|↑|0.872|±|0.0212|

vllm (pretrained=/root/autodl-tmp/70-10-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.904|±|0.0132|
| | |strict-match|5|exact_match|↑|0.878|±|0.0147|

vllm (pretrained=/root/autodl-tmp/70-10-1024,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7696|±|0.0135|
| - humanities|2|none| |acc|↑|0.7949|±|0.0259|
| - other|2|none| |acc|↑|0.7641|±|0.0289|
| - social sciences|2|none| |acc|↑|0.8222|±|0.0276|
| - stem|2|none| |acc|↑|0.7228|±|0.0250|

vllm (pretrained=/root/autodl-tmp/70-10-2048,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.860|±|0.022|
| | |strict-match|5|exact_match|↑|0.844|±|0.023|

vllm (pretrained=/root/autodl-tmp/70-10-2048,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.880|±|0.0145|
| | |strict-match|5|exact_match|↑|0.856|±|0.0157|

vllm (pretrained=/root/autodl-tmp/70-10-2048,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7684|±|0.0133|
| - humanities|2|none| |acc|↑|0.8154|±|0.0251|
| - other|2|none| |acc|↑|0.7795|±|0.0280|
| - social sciences|2|none| |acc|↑|0.8167|±|0.0276|
| - stem|2|none| |acc|↑|0.6982|±|0.0250|

vllm (pretrained=/root/autodl-tmp/70-09-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.872|±|0.0212|
| | |strict-match|5|exact_match|↑|0.860|±|0.0220|

vllm (pretrained=/root/autodl-tmp/70-09-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.890|±|0.0140|
| | |strict-match|5|exact_match|↑|0.866|±|0.0152|

vllm (pretrained=/root/autodl-tmp/70-09-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7754|±|0.0134|
| - humanities|2|none| |acc|↑|0.7949|±|0.0265|
| - other|2|none| |acc|↑|0.7846|±|0.0282|
| - social sciences|2|none| |acc|↑|0.8167|±|0.0275|
| - stem|2|none| |acc|↑|0.7298|±|0.0248|

vllm (pretrained=/root/autodl-tmp/70-08-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.896|±|0.0193|
| | |strict-match|5|exact_match|↑|0.876|±|0.0209|

vllm (pretrained=/root/autodl-tmp/70-08-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.886|±|0.0142|
| | |strict-match|5|exact_match|↑|0.860|±|0.0155|

vllm (pretrained=/root/autodl-tmp/70-08-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7649|±|0.0134|
| - humanities|2|none| |acc|↑|0.7949|±|0.0255|
| - other|2|none| |acc|↑|0.7795|±|0.0280|
| - social sciences|2|none| |acc|↑|0.8056|±|0.0288|
| - stem|2|none| |acc|↑|0.7088|±|0.0250|

vllm (pretrained=/root/autodl-tmp/75-10-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.868|±|0.0215|
| | |strict-match|5|exact_match|↑|0.840|±|0.0232|

vllm (pretrained=/root/autodl-tmp/75-10-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.888|±|0.0141|
| | |strict-match|5|exact_match|↑|0.860|±|0.0155|

vllm (pretrained=/root/autodl-tmp/75-10-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7754|±|0.0133|
| - humanities|2|none| |acc|↑|0.8000|±|0.0255|
| - other|2|none| |acc|↑|0.7846|±|0.0276|
| - social sciences|2|none| |acc|↑|0.8222|±|0.0276|
| - stem|2|none| |acc|↑|0.7228|±|0.0252|

vllm (pretrained=/root/autodl-tmp/75-7-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.888|±|0.0200|
| | |strict-match|5|exact_match|↑|0.856|±|0.0222|

vllm (pretrained=/root/autodl-tmp/75-7-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.898|±|0.0135|
| | |strict-match|5|exact_match|↑|0.854|±|0.0158|

vllm (pretrained=/root/autodl-tmp/75-7-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7696|±|0.0134|
| - humanities|2|none| |acc|↑|0.8103|±|0.0248|
| - other|2|none| |acc|↑|0.7692|±|0.0279|
| - social sciences|2|none| |acc|↑|0.8111|±|0.0283|
| - stem|2|none| |acc|↑|0.7158|±|0.0252|

vllm (pretrained=/root/autodl-tmp/75-7-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.880|±|0.0206|
| | |strict-match|5|exact_match|↑|0.848|±|0.0228|

vllm (pretrained=/root/autodl-tmp/75-7-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.900|±|0.0134|
| | |strict-match|5|exact_match|↑|0.864|±|0.0153|

vllm (pretrained=/root/autodl-tmp/75-7-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7801|±|0.0133|
| - humanities|2|none| |acc|↑|0.8103|±|0.0264|
| - other|2|none| |acc|↑|0.8103|±|0.0272|
| - social sciences|2|none| |acc|↑|0.8278|±|0.0273|
| - stem|2|none| |acc|↑|0.7088|±|0.0251|

vllm (pretrained=/root/autodl-tmp/75-7-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.860|±|0.0220|
| | |strict-match|5|exact_match|↑|0.832|±|0.0237|

vllm (pretrained=/root/autodl-tmp/75-7-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.882|±|0.0144|
| | |strict-match|5|exact_match|↑|0.852|±|0.0159|

vllm (pretrained=/root/autodl-tmp/75-7-1024,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7766|±|0.0133|
| - humanities|2|none| |acc|↑|0.8154|±|0.0252|
| - other|2|none| |acc|↑|0.7897|±|0.0271|
| - social sciences|2|none| |acc|↑|0.8056|±|0.0286|
| - stem|2|none| |acc|↑|0.7228|±|0.0250|

vllm (pretrained=/root/autodl-tmp/79-1-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.884|±|0.0203|
| | |strict-match|5|exact_match|↑|0.860|±|0.0220|

vllm (pretrained=/root/autodl-tmp/79-1-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.896|±|0.0137|
| | |strict-match|5|exact_match|↑|0.872|±|0.0150|

vllm (pretrained=/root/autodl-tmp/79-1-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7719|±|0.0135|
| - humanities|2|none| |acc|↑|0.8051|±|0.0259|
| - other|2|none| |acc|↑|0.7846|±|0.0276|
| - social sciences|2|none| |acc|↑|0.8056|±|0.0287|
| - stem|2|none| |acc|↑|0.7193|±|0.0251|

vllm (pretrained=/root/autodl-tmp/79-1-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.896|±|0.0193|
| | |strict-match|5|exact_match|↑|0.872|±|0.0212|

vllm (pretrained=/root/autodl-tmp/79-1-1024,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.880|±|0.0145|
| | |strict-match|5|exact_match|↑|0.858|±|0.0156|

vllm (pretrained=/root/autodl-tmp/79-1-1024,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7778|±|0.0132|
| - humanities|2|none| |acc|↑|0.8154|±|0.0247|
| - other|2|none| |acc|↑|0.7846|±|0.0281|
| - social sciences|2|none| |acc|↑|0.8333|±|0.0270|
| - stem|2|none| |acc|↑|0.7123|±|0.0250|

vllm (pretrained=/root/autodl-tmp/795-2-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.896|±|0.0193|
| | |strict-match|5|exact_match|↑|0.864|±|0.0217|

vllm (pretrained=/root/autodl-tmp/795-2-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.906|±|0.0131|
| | |strict-match|5|exact_match|↑|0.866|±|0.0152|

vllm (pretrained=/root/autodl-tmp/795-2-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7743|±|0.0133|
| - humanities|2|none| |acc|↑|0.8000|±|0.0254|
| - other|2|none| |acc|↑|0.8000|±|0.0273|
| - social sciences|2|none| |acc|↑|0.8333|±|0.0268|
| - stem|2|none| |acc|↑|0.7018|±|0.0254|

vllm (pretrained=/root/autodl-tmp/795-2-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.884|±|0.0203|
| | |strict-match|5|exact_match|↑|0.860|±|0.0220|

vllm (pretrained=/root/autodl-tmp/795-2-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.896|±|0.0137|
| | |strict-match|5|exact_match|↑|0.866|±|0.0152|

vllm (pretrained=/root/autodl-tmp/795-2-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7731|±|0.0134|
| - humanities|2|none| |acc|↑|0.8205|±|0.0250|
| - other|2|none| |acc|↑|0.7692|±|0.0278|
| - social sciences|2|none| |acc|↑|0.7889|±|0.0297|
| - stem|2|none| |acc|↑|0.7333|±|0.0247|

vllm (pretrained=/root/autodl-tmp/799-1-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.892|±|0.0197|
| | |strict-match|5|exact_match|↑|0.864|±|0.0217|

vllm (pretrained=/root/autodl-tmp/799-1-256,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.886|±|0.0142|
| | |strict-match|5|exact_match|↑|0.852|±|0.0159|

vllm (pretrained=/root/autodl-tmp/799-1-256,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7778|±|0.0132|
| - humanities|2|none| |acc|↑|0.8154|±|0.0253|
| - other|2|none| |acc|↑|0.7846|±|0.0275|
| - social sciences|2|none| |acc|↑|0.8111|±|0.0284|
| - stem|2|none| |acc|↑|0.7263|±|0.0244|

vllm (pretrained=/root/autodl-tmp/86-05-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.908|±|0.0183|
| | |strict-match|5|exact_match|↑|0.892|±|0.0197|

vllm (pretrained=/root/autodl-tmp/86-05-512,add_bos_token=true,max_model_len=3096,dtype=bfloat16), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|-----|------:|------|-----:|------|-|----:|-|-----:|
|gsm8k|3|flexible-extract|5|exact_match|↑|0.91|±|0.0128|
| | |strict-match|5|exact_match|↑|0.89|±|0.0140|

vllm (pretrained=/root/autodl-tmp/86-05-512,add_bos_token=true,max_model_len=3048,dtype=bfloat16), gen_kwargs: (None), limit: 15.0, num_fewshot: None, batch_size: 1

|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|------|------:|------|-----:|------|-|-----:|-|-----:|
|mmlu|2|none| |acc|↑|0.7801|±|0.0132|
| - humanities|2|none| |acc|↑|0.8308|±|0.0250|
| - other|2|none| |acc|↑|0.7897|±|0.0273|
| - social sciences|2|none| |acc|↑|0.8167|±|0.0279|
| - stem|2|none| |acc|↑|0.7158|±|0.0249|
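The Stderr column in the gsm8k tables is consistent with the sample standard error of a proportion, sqrt(p·(1−p)/(n−1)), where p is the exact-match rate and n is the `limit` (number of evaluated examples). A quick sanity check against the first two gsm8k runs:

```python
import math

def exact_match_stderr(p: float, n: int) -> float:
    """Sample standard error of a proportion p estimated from n examples."""
    return math.sqrt(p * (1 - p) / (n - 1))

# Values taken from the first gsm8k tables above.
print(round(exact_match_stderr(0.904, 250), 4))  # 0.0187 (limit 250, flexible-extract)
print(round(exact_match_stderr(0.910, 500), 4))  # 0.0128 (limit 500, flexible-extract)
```

Both match the reported ±0.0187 and ±0.0128, which confirms the Stderr values track the sample size rather than any model property.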
Model size: 23.6B params · Tensor types: BF16, I8 (Safetensors)

Model tree for noneUsername/shisa-v2-mistral-small-24b-W8A8: quantized (3 versions), including this model.