pingzhili committed
Commit ebd1d22 · 1 parent: 9509942
This view is limited to 50 files because the commit contains too many changes.
Files changed (50) — a sketch for fetching a single checkpoint directory follows the list:
  1. olmoe-distill-apps/config.json +33 -0
  2. olmoe-distill-apps/generation_config.json +6 -0
  3. olmoe-distill-apps/model-00001-of-00003.safetensors +3 -0
  4. olmoe-distill-apps/model-00002-of-00003.safetensors +3 -0
  5. olmoe-distill-apps/model-00003-of-00003.safetensors +3 -0
  6. olmoe-distill-apps/model.safetensors.index.json +0 -0
  7. olmoe-distill-camelai_biology/config.json +33 -0
  8. olmoe-distill-camelai_biology/generation_config.json +6 -0
  9. olmoe-distill-camelai_biology/model-00001-of-00003.safetensors +3 -0
  10. olmoe-distill-camelai_biology/model-00002-of-00003.safetensors +3 -0
  11. olmoe-distill-camelai_biology/model-00003-of-00003.safetensors +3 -0
  12. olmoe-distill-camelai_biology/model.safetensors.index.json +0 -0
  13. olmoe-distill-camelai_chemistry/config.json +33 -0
  14. olmoe-distill-camelai_chemistry/generation_config.json +6 -0
  15. olmoe-distill-camelai_chemistry/model-00001-of-00003.safetensors +3 -0
  16. olmoe-distill-camelai_chemistry/model-00002-of-00003.safetensors +3 -0
  17. olmoe-distill-camelai_chemistry/model-00003-of-00003.safetensors +3 -0
  18. olmoe-distill-camelai_chemistry/model.safetensors.index.json +0 -0
  19. olmoe-distill-camelai_physics/config.json +33 -0
  20. olmoe-distill-camelai_physics/generation_config.json +6 -0
  21. olmoe-distill-camelai_physics/model-00001-of-00003.safetensors +3 -0
  22. olmoe-distill-camelai_physics/model-00002-of-00003.safetensors +3 -0
  23. olmoe-distill-camelai_physics/model-00003-of-00003.safetensors +3 -0
  24. olmoe-distill-camelai_physics/model.safetensors.index.json +0 -0
  25. olmoe-distill-code_contests/config.json +33 -0
  26. olmoe-distill-code_contests/generation_config.json +6 -0
  27. olmoe-distill-code_contests/model-00001-of-00003.safetensors +3 -0
  28. olmoe-distill-code_contests/model-00002-of-00003.safetensors +3 -0
  29. olmoe-distill-code_contests/model-00003-of-00003.safetensors +3 -0
  30. olmoe-distill-code_contests/model.safetensors.index.json +0 -0
  31. olmoe-distill-codeforces/config.json +33 -0
  32. olmoe-distill-codeforces/generation_config.json +6 -0
  33. olmoe-distill-codeforces/model-00001-of-00003.safetensors +3 -0
  34. olmoe-distill-codeforces/model-00002-of-00003.safetensors +3 -0
  35. olmoe-distill-codeforces/model-00003-of-00003.safetensors +3 -0
  36. olmoe-distill-codeforces/model.safetensors.index.json +0 -0
  37. olmoe-distill-numina_math/config.json +33 -0
  38. olmoe-distill-numina_math/generation_config.json +6 -0
  39. olmoe-distill-numina_math/model-00001-of-00003.safetensors +3 -0
  40. olmoe-distill-numina_math/model-00002-of-00003.safetensors +3 -0
  41. olmoe-distill-numina_math/model-00003-of-00003.safetensors +3 -0
  42. olmoe-distill-numina_math/model.safetensors.index.json +0 -0
  43. olmoe-distill-riddle_sense/config.json +33 -0
  44. olmoe-distill-riddle_sense/generation_config.json +6 -0
  45. olmoe-distill-riddle_sense/model-00001-of-00003.safetensors +3 -0
  46. olmoe-distill-riddle_sense/model-00002-of-00003.safetensors +3 -0
  47. olmoe-distill-riddle_sense/model-00003-of-00003.safetensors +3 -0
  48. olmoe-distill-riddle_sense/model.safetensors.index.json +0 -0
  49. olmoe-distill-taco/config.json +33 -0
  50. olmoe-distill-taco/generation_config.json +6 -0
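Each `olmoe-distill-*` directory added in this commit is a self-contained OLMoE checkpoint (a `config.json`, a `generation_config.json`, three safetensors shards, and a shard index). A minimal sketch for pulling just one of them with `huggingface_hub` — the repository id below is a placeholder, since this view does not show it:

```python
# Sketch: download a single checkpoint directory instead of the whole repo.
# REPO_ID is hypothetical; substitute the repository this commit belongs to.
from huggingface_hub import snapshot_download

REPO_ID = "pingzhili/olmoe-distill"  # placeholder, not confirmed by this diff
local_dir = snapshot_download(
    repo_id=REPO_ID,
    allow_patterns=["olmoe-distill-apps/*"],  # fetch only one sub-model
)
print(local_dir)
```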
olmoe-distill-apps/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
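The config above describes a compact mixture-of-experts decoder: 16 layers, hidden size 2048, 64 experts with top-8 routing (`num_experts_per_tok`), fp16 weights. A minimal loading sketch, assuming the directory has been downloaded locally; note that the commit adds no tokenizer files, so falling back to the base model's tokenizer is an assumption:

```python
# Sketch: load one distilled checkpoint. The local path is hypothetical.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "./olmoe-distill-apps"
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16)
# No tokenizer is included in this commit; using the base model's tokenizer
# (allenai/OLMoE-1B-7B-0125, per "_name_or_path") is an assumption.
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0125")

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0]))
```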
olmoe-distill-apps/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
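These defaults (`eos_token_id=50279`, `pad_token_id=1`) are picked up automatically by `generate()`; they can also be inspected directly. A small sketch, path hypothetical:

```python
# Sketch: inspect the generation defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("./olmoe-distill-apps")
print(gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # expect 50279 and 1
```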
olmoe-distill-apps/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e5f36128fca281ab4ad1fa498be6ccefb8044945c1f5934f2ea765df45ba024
+ size 4997743728
olmoe-distill-apps/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5e2729adf32367ed0ce94ec27cd91bc0f7dcb12582f4aaae6bf60f65ad3af614
+ size 4997233976
olmoe-distill-apps/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6101c932f1146ec6cb17b8fb1897bd183d1a4bfedfb3ac0e3e2f70c89ae5063
+ size 3843741032
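The three files above are Git LFS pointers: the repository stores only the `oid` (a SHA-256 of the blob) and its `size`, while the roughly 13.8 GB of actual weights live in LFS storage. A sketch for checking a downloaded shard against its recorded hash:

```python
# Sketch: verify a shard against the sha256 oid from its LFS pointer.
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# oid taken from the model-00003-of-00003.safetensors pointer above
expected = "a6101c932f1146ec6cb17b8fb1897bd183d1a4bfedfb3ac0e3e2f70c89ae5063"
assert sha256_of("olmoe-distill-apps/model-00003-of-00003.safetensors") == expected
```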
olmoe-distill-apps/model.safetensors.index.json ADDED
The diff for this file is too large to render.
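The index file maps every tensor name to the shard that contains it; `from_pretrained` consults it to reassemble the sharded checkpoint. Its layout follows the standard Hugging Face sharding format (the keys shown below are illustrative, since the file itself is too large to render here):

```python
# Sketch: inspect the shard index. Tensor names are illustrative only.
import json

with open("olmoe-distill-apps/model.safetensors.index.json") as f:
    index = json.load(f)

# Expected shape (standard HF sharded-checkpoint format):
# {"metadata": {"total_size": ...},
#  "weight_map": {"<tensor name>": "model-0000X-of-00003.safetensors", ...}}
shards = sorted(set(index["weight_map"].values()))
print(len(index["weight_map"]), "tensors across", len(shards), "shards")
```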
 
olmoe-distill-camelai_biology/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-camelai_biology/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-camelai_biology/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fd6bd55ad07f5460c9dec0a6158704d2cfe1ae552e979aa8c65a91ed103df48
+ size 4997743728
olmoe-distill-camelai_biology/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33b91a1ed632be4847ef3685d8620fc8374d19c7a355ccfc879cef878a68717f
+ size 4997233976
olmoe-distill-camelai_biology/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31a17732387a98b3a96068577d2fb9d7c94197d81e65e6447117570e994b1508
+ size 3843741032
olmoe-distill-camelai_biology/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-camelai_chemistry/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-camelai_chemistry/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-camelai_chemistry/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1c9684d4d17aa78e0b3f212174193d7cbbf2c5f8e5d0515bcbfc7eb202c6d86
+ size 4997743728
olmoe-distill-camelai_chemistry/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:448c7ea2a9e85f77391481aa95d0af68d4a3197b2a10ce9683b0151fc96fe178
+ size 4997233976
olmoe-distill-camelai_chemistry/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b0f12b19ecd5d38ab96fbe2f67dcf53e9df68935aae879af277452cfc8a9d8c
+ size 3843741032
olmoe-distill-camelai_chemistry/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-camelai_physics/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-camelai_physics/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-camelai_physics/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c54438ae3f01c522336ffe3e46475e7c22d372eff80dcd9fe57ff10ff3fff903
+ size 4997743728
olmoe-distill-camelai_physics/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b43f696bf7d169640439470dc8623d181b4ce7cf61f8136e11c660fab2e2a5f
+ size 4997233976
olmoe-distill-camelai_physics/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5a21f33aa9d0548428a5036d64138f85257de582c69b080d1631608704f062a
+ size 3843741032
olmoe-distill-camelai_physics/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-code_contests/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-code_contests/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-code_contests/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32a2ba6b4a4fe160163835a4bead41dd463eeeaa587f63c9ba963fe461e874ef
+ size 4997743728
olmoe-distill-code_contests/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6880dab1368e4094da7b71d02f64ce7a859ed498813ab3c06c002d87cfba8a1e
+ size 4997233976
olmoe-distill-code_contests/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ddc9804d736d0a01a72ae62195ec0b641935c7e0afc7a616954e774d59bbd8a
+ size 3843741032
olmoe-distill-code_contests/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-codeforces/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-codeforces/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-codeforces/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ea930a0e55b7a2a1e31b96629ffa3c3bdd04c1f38f620b5e9a0ae88515d89da
+ size 4997743728
olmoe-distill-codeforces/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e939e9d2fae980372265980c51d599b4238ea73ac0d2f295678a6bd91028e00
+ size 4997233976
olmoe-distill-codeforces/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef5a2e4f9326791d85a6955f55e3e869ba0998710b2d5b2d56b591f025c44586
+ size 3843741032
olmoe-distill-codeforces/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-numina_math/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-numina_math/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-numina_math/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef72e6e72aa5ad16df92804057efd04b5291cef1b1658c04773a64f1f9af858e
+ size 4997743728
olmoe-distill-numina_math/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b102adae4a5e665db3be061fb69ce0e435b217bbb67b6a7f185a36deb25bde12
+ size 4997233976
olmoe-distill-numina_math/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a9172000524559e4a07388783f180d50f8de0f6dffafb308d6fadb89ff15c47
+ size 3843741032
olmoe-distill-numina_math/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-riddle_sense/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-riddle_sense/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }
olmoe-distill-riddle_sense/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76ae70f13a01023cde6dda52c3e818c05ad4157415450f7ba9ab296bb49128fa
+ size 4997743728
olmoe-distill-riddle_sense/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46f9363a7d2ec64236c9abeebc106e93588bbc4c7dcd464db9a4dff5be1a321c
+ size 4997233976
olmoe-distill-riddle_sense/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:835d5057d04320b3d2b7e3245354a9da6afefc79a420e9d7490ba8a85e2e7e10
+ size 3843741032
olmoe-distill-riddle_sense/model.safetensors.index.json ADDED
The diff for this file is too large to render.
 
olmoe-distill-taco/config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "allenai/OLMoE-1B-7B-0125",
+ "architectures": [
+ "OlmoeForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "clip_qkv": null,
+ "eos_token_id": 50279,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 1024,
+ "max_position_embeddings": 4096,
+ "model_type": "olmoe",
+ "norm_topk_prob": false,
+ "num_attention_heads": 16,
+ "num_experts": 64,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "output_router_logits": false,
+ "pad_token_id": 1,
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "router_aux_loss_coef": 0.01,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float16",
+ "transformers_version": "4.49.0",
+ "use_cache": true,
+ "vocab_size": 50304
+ }
olmoe-distill-taco/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": 50279,
+ "pad_token_id": 1,
+ "transformers_version": "4.49.0"
+ }