Tags: Feature Extraction · Transformers · Safetensors · English · Chinese · emova · Omni-modal-LLM · Multi-modal-LLM · Emotional-spoken-dialogue · custom_code · Eval Results
Commit 8152a02 · verified · Parent(s): 4eb53af
Committed by zhili-liu

Upload EMOVAForConditionalGeneration

Files changed (3):
  1. README.md +4 -4
  2. config.json +52 -52
  3. generation_config.json +1 -1
README.md CHANGED
```diff
@@ -159,19 +159,19 @@ model-index:
       name: accuracy
       verified: true
   - task:
-      name: Automatic Speech Recognition
       type: automatic-speech-recognition
+      name: Automatic Speech Recognition
     dataset:
       name: LibriSpeech (clean)
       type: librispeech_asr
       config: clean
       split: test
-      args:
+      args:
         language: en
     metrics:
-    - name: Test WER
-      type: wer
+    - type: wer
       value: 5.4
+      name: Test WER
 ---
 
 # EMOVA-Qwen-2.5-3B-HF
```
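The README change only reorders keys inside the model-index metric entries (`type` before `name`); since YAML mappings are unordered, the parsed metadata is unchanged. A minimal sketch checking that, assuming PyYAML is installed:

```python
# Minimal sketch: both orderings of the WER metric entry parse to the
# same data, so the README change is semantically a no-op.
import yaml

old_entry = """
- name: Test WER
  type: wer
  value: 5.4
"""

new_entry = """
- type: wer
  value: 5.4
  name: Test WER
"""

# YAML mappings are unordered; key order does not affect the parsed result.
assert yaml.safe_load(old_entry) == yaml.safe_load(new_entry)
print("metric entries are equivalent")
```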
config.json CHANGED
```diff
@@ -1,52 +1,52 @@
-{
-  "architectures": [
-    "EMOVAForConditionalGeneration"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_emova.EMOVAConfig",
-    "AutoModel": "modeling_emova.EMOVAForConditionalGeneration",
-    "AutoModelForCausalLM": "modeling_emova.EMOVAForConditionalGeneration"
-  },
-  "default_system_prompt": "You are a helpful assistant.",
-  "ignore_index": -100,
-  "image_token_index": 155761,
-  "mm_projector_config": {
-    "mlp_depth": 2,
-    "trainable": true,
-    "type": "MLPProjector"
-  },
-  "model_type": "emova",
-  "text_config": {
-    "architectures": [
-      "Qwen2ForCausalLM"
-    ],
-    "bos_token_id": 151643,
-    "eos_token_id": 151645,
-    "hidden_size": 2048,
-    "intermediate_size": 11008,
-    "max_position_embeddings": 32768,
-    "max_window_layers": 70,
-    "model_type": "qwen2",
-    "num_attention_heads": 16,
-    "num_hidden_layers": 36,
-    "num_key_value_heads": 2,
-    "rope_theta": 1000000.0,
-    "sliding_window": null,
-    "tie_word_embeddings": true,
-    "torch_dtype": "float32",
-    "use_sliding_window": false,
-    "vocab_size": 155776
-  },
-  "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.47.1",
-  "vision_config": {
-    "_attn_implementation_autoset": true,
-    "max_pixels": 3211264,
-    "model_type": "qwen2_vl",
-    "pretrained_model_name_or_path": "Emova-ollm/qwen2vit600m",
-    "trainable": true,
-    "type": "Qwen2VisionTower",
-    "unfreeze_mm_vision_tower": true
-  }
-}
+{
+  "_name_or_path": "D:\\temp\\emova_data\\checkpoints\\emova_qwen2_5_3b",
+  "architectures": [
+    "EMOVAForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_emova.EMOVAConfig",
+    "AutoModel": "modeling_emova.EMOVAForConditionalGeneration",
+    "AutoModelForCausalLM": "modeling_emova.EMOVAForConditionalGeneration"
+  },
+  "default_system_prompt": "You are a helpful assistant.",
+  "ignore_index": -100,
+  "image_token_index": 155761,
+  "mm_projector_config": {
+    "mlp_depth": 2,
+    "trainable": true,
+    "type": "MLPProjector"
+  },
+  "model_type": "emova",
+  "text_config": {
+    "architectures": [
+      "Qwen2ForCausalLM"
+    ],
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "hidden_size": 2048,
+    "intermediate_size": 11008,
+    "max_position_embeddings": 32768,
+    "max_window_layers": 70,
+    "model_type": "qwen2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 2,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "float32",
+    "use_sliding_window": false,
+    "vocab_size": 155776
+  },
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "vision_config": {
+    "max_pixels": 3211264,
+    "model_type": "qwen2_vl",
+    "pretrained_model_name_or_path": "Emova-ollm/qwen2vit600m",
+    "trainable": true,
+    "type": "Qwen2VisionTower",
+    "unfreeze_mm_vision_tower": true
+  }
+}
```
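The substantive config changes are the added local `"_name_or_path"`, the recorded `"transformers_version"` moving from 4.47.1 to 4.44.2, and the dropped `"_attn_implementation_autoset"` flag in `vision_config`. Because of the `"auto_map"` entries (and the repo's custom_code tag), loading resolves to the repo's own `configuration_emova.py` and `modeling_emova.py`, so `trust_remote_code=True` is required. A loading sketch, with the repo id assumed from the model card title and the Emova-ollm org used elsewhere in the config:

```python
# Sketch, not the official usage: the "auto_map" in config.json routes
# AutoModel to the repo's custom modeling code, hence trust_remote_code=True.
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "Emova-ollm/EMOVA-Qwen-2.5-3B-HF",  # assumed repo id
    torch_dtype=torch.bfloat16,          # matches "torch_dtype" in config.json
    trust_remote_code=True,              # required by "auto_map" / custom_code
)
```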
generation_config.json CHANGED
```diff
@@ -2,5 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
-  "transformers_version": "4.47.1"
+  "transformers_version": "4.44.2"
 }
```
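Here only the recorded `transformers_version` changes; the token ids stay aligned with `text_config` (`bos_token_id` 151643, `eos_token_id` 151645). The file is picked up automatically at load time, but it can also be inspected directly; a small sketch, with the same assumed repo id as above:

```python
# Sketch: read generation_config.json on its own and check the token ids.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Emova-ollm/EMOVA-Qwen-2.5-3B-HF")  # assumed repo id
print(gen_cfg.bos_token_id)  # 151643
print(gen_cfg.eos_token_id)  # 151645
```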