vincentchao committed · verified
Commit 9106cdb · 1 Parent(s): 7371e76

Upload folder using huggingface_hub
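For reference, a minimal sketch of the kind of `huggingface_hub` call that produces a commit like this one; the folder path and repo id below are placeholders, not values recorded in this commit.

```python
# Minimal sketch of a folder upload with huggingface_hub.
# The folder_path and repo_id are placeholders (assumptions), not taken from this commit.
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./checkpoint-2131",        # local model folder (placeholder)
    repo_id="vincentchao/<this-repo>",      # destination repo (placeholder)
    commit_message="Upload folder using huggingface_hub",
)
```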

config.json ADDED
@@ -0,0 +1,66 @@
+ {
+ "_name_or_path": "/common/home/users/w/wzhao/qllava/vqllava3_finetune_on_filted_dataset_new/checkpoint-2131",
+ "architectures": [
+ "LlavaNextForConditionalGeneration"
+ ],
+ "ignore_index": -100,
+ "image_grid_pinpoints": [
+ [
+ 336,
+ 672
+ ],
+ [
+ 672,
+ 336
+ ],
+ [
+ 672,
+ 672
+ ],
+ [
+ 1008,
+ 336
+ ],
+ [
+ 336,
+ 1008
+ ]
+ ],
+ "image_seq_length": 576,
+ "image_token_index": 128256,
+ "model_type": "llava_next",
+ "projector_hidden_act": "gelu",
+ "text_config": {
+ "_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "bos_token_id": 128000,
+ "eos_token_id": 128009,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 8192,
+ "model_type": "llama",
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 500000.0,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 128320
+ },
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.46.1",
+ "use_image_newline_parameter": true,
+ "vision_config": {
+ "hidden_size": 1024,
+ "image_size": 336,
+ "intermediate_size": 4096,
+ "model_type": "clip_vision_model",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "patch_size": 14,
+ "projection_dim": 768,
+ "vocab_size": 32000
+ },
+ "vision_feature_layer": -2,
+ "vision_feature_select_strategy": "default"
+ }
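The config describes a LLaVA-NeXT model (`llava_next`) with a Meta-Llama-3-8B-Instruct text backbone and a 24-layer CLIP ViT vision tower at 336 px input. A hedged sketch of loading it with the stock transformers classes named in `architectures`; the repo id is a placeholder, and any weights outside the standard LLaVA-NeXT layout would not be used by these classes.

```python
# Hedged sketch: loading this checkpoint with the stock transformers classes.
# The repo_id is a placeholder; extra weights not in the standard LLaVA-NeXT
# layout would simply be skipped by this class.
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

repo_id = "vincentchao/<this-repo>"  # placeholder, not the actual repo name

processor = LlavaNextProcessor.from_pretrained(repo_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # config stores float32; bf16 roughly halves memory
    device_map="auto",
)
```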
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 128000,
+ "eos_token_id": 128009,
+ "transformers_version": "4.46.1"
+ }
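The generation config only pins the Llama-3 special tokens (bos_token_id 128000, eos_token_id 128009), which `model.generate` picks up automatically. Continuing the loading sketch above, an illustrative single-image call; the prompt wording and image path are assumptions, not taken from this repo.

```python
# Illustrative only, continuing the loading sketch above. The prompt and image
# path are placeholders; eos_token_id matches generation_config.json (128009).
from PIL import Image

image = Image.open("example.jpg")  # placeholder image path
prompt = "<image>\nDescribe this image."

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128, eos_token_id=128009)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```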
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:316a0b931dd6da3341cbd4f6beb6bbd07b347278d61f03e5bdda86fa050b7e9d
+ size 4939760848
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fbda1cd72808cce11e02dfe6950690561589bc828b35e0853d26978301382e2
+ size 4999813864
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84a0f8a111d4d37c215122a185b6cfda626456b618a9ee94c3855fee778f1ff0
+ size 4832008176
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:486029793e97d9610fc0c2819b8b1785eff787ec1bee69df4380f8f911dcfa62
+ size 4999813920
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c133428f118b4a785d0c2763544b5694735c3ea90f660ff08c3bbcdf9569627
+ size 4999813920
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59027fb4ae13d7c242b5cfad81feebdf645499338dc6a7885a548f7ef2228d54
+ size 4832008200
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b0ea2e7d880e4103b58fb3bb8f2497f1c3118cba658dc32dc1987aaa5d61a17
+ size 4082224048
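Each shard above is stored as a Git LFS pointer (spec v1) that records the blob's SHA-256 and byte size rather than the data itself. A small sketch of checking a downloaded shard against its pointer; the expected values are copied from the first shard's pointer, and the local path is a placeholder.

```python
# Check a downloaded shard against the oid/size in its LFS pointer.
# Expected values are from model-00001-of-00007.safetensors' pointer above;
# the local path is a placeholder.
import hashlib
import os

path = "model-00001-of-00007.safetensors"
expected_sha256 = "316a0b931dd6da3341cbd4f6beb6bbd07b347278d61f03e5bdda86fa050b7e9d"
expected_size = 4939760848

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_sha256, "sha256 mismatch"
print("shard matches its LFS pointer")
```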
model.safetensors.index.json ADDED
@@ -0,0 +1,698 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 33685350400
4
+ },
5
+ "weight_map": {
6
+ "image_newline": "model-00001-of-00007.safetensors",
7
+ "language_model.lm_head.weight": "model-00007-of-00007.safetensors",
8
+ "language_model.model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
10
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
11
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
12
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
13
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
14
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
15
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
16
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
17
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
18
+ "language_model.model.layers.1.input_layernorm.weight": "model-00002-of-00007.safetensors",
19
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
20
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
21
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
22
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
23
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
24
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
25
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
26
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
27
+ "language_model.model.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
28
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
29
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
30
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
31
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
32
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
33
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
34
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
35
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
36
+ "language_model.model.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
37
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
38
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
39
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
40
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
41
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
42
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
43
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
44
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
+ "language_model.model.layers.12.input_layernorm.weight": "model-00004-of-00007.safetensors",
46
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
47
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
48
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
49
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
50
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
51
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
52
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
53
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
54
+ "language_model.model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
55
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
56
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
57
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
58
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
59
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
60
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
61
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
62
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
63
+ "language_model.model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
64
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
65
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
66
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
67
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
68
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
69
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
70
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
71
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
72
+ "language_model.model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
73
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
74
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
75
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
76
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
77
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
78
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
79
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
80
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
81
+ "language_model.model.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
87
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
88
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
89
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
90
+ "language_model.model.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
91
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
92
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
93
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
94
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
95
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
96
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
97
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
98
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
99
+ "language_model.model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
100
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
101
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
102
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
103
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
104
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
105
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
106
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
107
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
108
+ "language_model.model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
109
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
110
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
111
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
112
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
113
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
114
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
115
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
116
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
117
+ "language_model.model.layers.2.input_layernorm.weight": "model-00002-of-00007.safetensors",
118
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
119
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
120
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
121
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
122
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
123
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
124
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
125
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
126
+ "language_model.model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
127
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
128
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
129
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
130
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
131
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
132
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
133
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
134
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
135
+ "language_model.model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
136
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
137
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
138
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
139
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
140
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
141
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
142
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
143
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
144
+ "language_model.model.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
145
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
146
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
147
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
148
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
149
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
150
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
151
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
152
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
+ "language_model.model.layers.23.input_layernorm.weight": "model-00005-of-00007.safetensors",
154
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
155
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
156
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
157
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
158
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
159
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
160
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
161
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
162
+ "language_model.model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
163
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
164
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
165
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
166
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
167
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
168
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
169
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
170
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
171
+ "language_model.model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
172
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
173
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
174
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
175
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
176
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
177
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
178
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
179
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
180
+ "language_model.model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
181
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
182
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
183
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
184
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
185
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
186
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
187
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
188
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
189
+ "language_model.model.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
190
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
191
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
192
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
193
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
194
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
195
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
196
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
197
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
198
+ "language_model.model.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
199
+ "language_model.model.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
200
+ "language_model.model.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
201
+ "language_model.model.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
202
+ "language_model.model.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
203
+ "language_model.model.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
204
+ "language_model.model.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
205
+ "language_model.model.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
206
+ "language_model.model.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
207
+ "language_model.model.layers.29.input_layernorm.weight": "model-00007-of-00007.safetensors",
208
+ "language_model.model.layers.29.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
209
+ "language_model.model.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
210
+ "language_model.model.layers.29.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
211
+ "language_model.model.layers.29.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
212
+ "language_model.model.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
213
+ "language_model.model.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
214
+ "language_model.model.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
215
+ "language_model.model.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
216
+ "language_model.model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
217
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
218
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
219
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
220
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
221
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
222
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
223
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
224
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
225
+ "language_model.model.layers.30.input_layernorm.weight": "model-00007-of-00007.safetensors",
226
+ "language_model.model.layers.30.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
227
+ "language_model.model.layers.30.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
228
+ "language_model.model.layers.30.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
229
+ "language_model.model.layers.30.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
230
+ "language_model.model.layers.30.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
231
+ "language_model.model.layers.30.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
232
+ "language_model.model.layers.30.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
233
+ "language_model.model.layers.30.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
234
+ "language_model.model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
235
+ "language_model.model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
236
+ "language_model.model.layers.31.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
237
+ "language_model.model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
238
+ "language_model.model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
239
+ "language_model.model.layers.31.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
240
+ "language_model.model.layers.31.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
241
+ "language_model.model.layers.31.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
242
+ "language_model.model.layers.31.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
243
+ "language_model.model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
244
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
245
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
246
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
247
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
248
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
249
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
250
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
251
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
252
+ "language_model.model.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
253
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
254
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
255
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
256
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
257
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
258
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
259
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
260
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
261
+ "language_model.model.layers.6.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
267
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
268
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
269
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
270
+ "language_model.model.layers.7.input_layernorm.weight": "model-00003-of-00007.safetensors",
271
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
272
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
273
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
274
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
275
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
276
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
277
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
278
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
279
+ "language_model.model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
280
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
281
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
282
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
283
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
284
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
285
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
286
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
287
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
288
+ "language_model.model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
289
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
290
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
291
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
292
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
293
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
294
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
295
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
296
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
297
+ "language_model.model.norm.weight": "model-00007-of-00007.safetensors",
298
+ "multi_modal_projector.linear_1.bias": "model-00001-of-00007.safetensors",
299
+ "multi_modal_projector.linear_1.weight": "model-00001-of-00007.safetensors",
300
+ "multi_modal_projector.linear_2.bias": "model-00001-of-00007.safetensors",
301
+ "multi_modal_projector.linear_2.weight": "model-00001-of-00007.safetensors",
302
+ "multi_modal_projector.vq.embedding.weight": "model-00001-of-00007.safetensors",
303
+ "multi_modal_projector.vq_cls._category_mapping_indices": "model-00001-of-00007.safetensors",
304
+ "multi_modal_projector.vq_cls._category_mapping_names": "model-00001-of-00007.safetensors",
305
+ "multi_modal_projector.vq_cls.embedding.weight": "model-00001-of-00007.safetensors",
306
+ "vision_tower.vision_model.embeddings.class_embedding": "model-00001-of-00007.safetensors",
307
+ "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00007.safetensors",
308
+ "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00007.safetensors",
309
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00007.safetensors",
310
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00007.safetensors",
311
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00007.safetensors",
312
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00007.safetensors",
313
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00007.safetensors",
314
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00007.safetensors",
315
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00007.safetensors",
316
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00007.safetensors",
317
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
318
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
319
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
320
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
321
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
322
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
323
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
324
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
325
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00007.safetensors",
326
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00007.safetensors",
327
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00007.safetensors",
328
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00007.safetensors",
329
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00007.safetensors",
330
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00007.safetensors",
331
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00007.safetensors",
332
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00007.safetensors",
333
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
334
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
335
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
336
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
337
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
338
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
339
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
340
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
341
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00007.safetensors",
342
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00007.safetensors",
343
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00007.safetensors",
344
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00007.safetensors",
345
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00007.safetensors",
346
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00007.safetensors",
347
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00007.safetensors",
348
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00007.safetensors",
349
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
350
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
351
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
352
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
353
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
354
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
355
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
356
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
357
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00007.safetensors",
358
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00007.safetensors",
359
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00007.safetensors",
360
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00007.safetensors",
361
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00007.safetensors",
362
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00007.safetensors",
363
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00007.safetensors",
364
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00007.safetensors",
365
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
366
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
367
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
368
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
369
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
370
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
371
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
372
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
373
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00007.safetensors",
374
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00007.safetensors",
375
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00007.safetensors",
376
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00007.safetensors",
377
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00007.safetensors",
378
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00007.safetensors",
379
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00007.safetensors",
380
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00007.safetensors",
381
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
382
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
383
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
384
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
385
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
386
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
387
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
388
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
389
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00007.safetensors",
390
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00007.safetensors",
391
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00007.safetensors",
392
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00007.safetensors",
393
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00007.safetensors",
394
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00007.safetensors",
395
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00007.safetensors",
396
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00007.safetensors",
397
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
398
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
399
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
400
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
401
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
402
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
403
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
404
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
405
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00007.safetensors",
406
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00007.safetensors",
407
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00007.safetensors",
408
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00007.safetensors",
409
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00007.safetensors",
410
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00007.safetensors",
411
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00007.safetensors",
412
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00007.safetensors",
413
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
414
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
415
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
416
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
417
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
418
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
419
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
420
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
421
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00007.safetensors",
422
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00007.safetensors",
423
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00007.safetensors",
424
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00007.safetensors",
425
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00007.safetensors",
426
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00007.safetensors",
427
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00007.safetensors",
428
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00007.safetensors",
429
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
430
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
431
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
432
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
433
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
434
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
435
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
436
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
437
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00007.safetensors",
438
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00007.safetensors",
439
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00007.safetensors",
440
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00007.safetensors",
441
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00007.safetensors",
442
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00007.safetensors",
443
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00007.safetensors",
444
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00007.safetensors",
445
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
446
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
447
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
448
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
449
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
450
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
451
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
452
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
453
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00007.safetensors",
454
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00007.safetensors",
455
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00007.safetensors",
456
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00007.safetensors",
457
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00007.safetensors",
458
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00007.safetensors",
459
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00007.safetensors",
460
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00007.safetensors",
461
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
462
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
463
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
464
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
465
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
466
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
467
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
468
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
469
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00007.safetensors",
470
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00007.safetensors",
471
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00007.safetensors",
472
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00007.safetensors",
473
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00007.safetensors",
474
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00007.safetensors",
475
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00007.safetensors",
476
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00007.safetensors",
477
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
478
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
479
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
480
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
481
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
482
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
483
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
484
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
485
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00007.safetensors",
486
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00007.safetensors",
487
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00007.safetensors",
488
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00007.safetensors",
489
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00007.safetensors",
490
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00007.safetensors",
491
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00007.safetensors",
492
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00007.safetensors",
493
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
494
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
495
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
496
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
497
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
498
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
499
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
500
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
501
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00007.safetensors",
502
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00007.safetensors",
503
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00007.safetensors",
504
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00007.safetensors",
505
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00007.safetensors",
506
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00007.safetensors",
507
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00007.safetensors",
508
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00007.safetensors",
509
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
510
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
511
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
512
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
513
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
514
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
515
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
516
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
517
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00007.safetensors",
518
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00007.safetensors",
519
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00007.safetensors",
520
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00007.safetensors",
521
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00007.safetensors",
522
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00007.safetensors",
523
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00007.safetensors",
524
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00007.safetensors",
525
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
526
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
527
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
528
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
529
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
530
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
531
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
532
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
533
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00007.safetensors",
534
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00007.safetensors",
535
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00007.safetensors",
536
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00007.safetensors",
537
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00007.safetensors",
538
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00007.safetensors",
539
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00007.safetensors",
540
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00007.safetensors",
541
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
542
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
543
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
544
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
545
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
546
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
547
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
548
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
549
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00007.safetensors",
550
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00007.safetensors",
551
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00007.safetensors",
552
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00007.safetensors",
553
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00007.safetensors",
554
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00007.safetensors",
555
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00007.safetensors",
556
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00007.safetensors",
557
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
558
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
559
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
560
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
561
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
562
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
563
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
564
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
565
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00007.safetensors",
566
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00007.safetensors",
567
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00007.safetensors",
568
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00007.safetensors",
569
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00007.safetensors",
570
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00007.safetensors",
571
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00007.safetensors",
572
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00007.safetensors",
573
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
574
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
575
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
576
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
577
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
578
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
579
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
580
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
581
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00007.safetensors",
582
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00007.safetensors",
583
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00007.safetensors",
584
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00007.safetensors",
585
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00007.safetensors",
586
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00007.safetensors",
587
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00007.safetensors",
588
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00007.safetensors",
589
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
590
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
591
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
592
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
593
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
594
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
595
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
596
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
597
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00007.safetensors",
598
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00007.safetensors",
599
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00007.safetensors",
600
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00007.safetensors",
601
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00007.safetensors",
602
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00007.safetensors",
603
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00007.safetensors",
604
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00007.safetensors",
605
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
606
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
607
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
608
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
609
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
610
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
611
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
612
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
613
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00007.safetensors",
614
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00007.safetensors",
615
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00007.safetensors",
616
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00007.safetensors",
617
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00007.safetensors",
618
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00007.safetensors",
619
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00007.safetensors",
620
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00007.safetensors",
621
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
622
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
623
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
624
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
625
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
626
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
627
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
628
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
629
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00007.safetensors",
630
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00007.safetensors",
631
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00007.safetensors",
632
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00007.safetensors",
633
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00007.safetensors",
634
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00007.safetensors",
635
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00007.safetensors",
636
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00007.safetensors",
637
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
638
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
639
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
640
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
641
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
642
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
643
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
644
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
645
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00007.safetensors",
646
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00007.safetensors",
647
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00007.safetensors",
648
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00007.safetensors",
649
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00007.safetensors",
650
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00007.safetensors",
651
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00007.safetensors",
652
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00007.safetensors",
653
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
654
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
655
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
656
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
657
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
658
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
659
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
660
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
661
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00007.safetensors",
662
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00007.safetensors",
663
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00007.safetensors",
664
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00007.safetensors",
665
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00007.safetensors",
666
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00007.safetensors",
667
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00007.safetensors",
668
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00007.safetensors",
669
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
670
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
671
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
672
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
673
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
674
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
675
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
676
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
677
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00007.safetensors",
678
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00007.safetensors",
679
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00007.safetensors",
680
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00007.safetensors",
681
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00007.safetensors",
682
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00007.safetensors",
683
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00007.safetensors",
684
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00007.safetensors",
685
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
686
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
687
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
688
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
689
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
690
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
691
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
692
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
693
+ "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00007.safetensors",
694
+ "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00007.safetensors",
695
+ "vision_tower.vision_model.pre_layrnorm.bias": "model-00001-of-00007.safetensors",
696
+ "vision_tower.vision_model.pre_layrnorm.weight": "model-00001-of-00007.safetensors"
697
+ }
698
+ }
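For reference, a minimal sketch of how a shard index like the one above can be used to look up a single tensor. The `weight_map` field name follows the standard safetensors index layout, and the chosen key is one of the entries listed above; file names match the shards in this commit.

import json
from safetensors import safe_open

# Load the index and find which shard holds a given tensor.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "vision_tower.vision_model.post_layernorm.weight"
shard = index["weight_map"][name]          # e.g. "model-00001-of-00007.safetensors"

# Read just that tensor from the shard without loading the whole file.
with safe_open(shard, framework="pt", device="cpu") as f:
    tensor = f.get_tensor(name)
print(tensor.shape)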
qllava3_test.py ADDED
@@ -0,0 +1,1466 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch Llava-NeXT model."""
16
+
17
+ import math
+ import pickle    # used by VectorQuantizerCLS.load_codebook
+ import random    # used when randomizing codebook indices
+ import warnings  # used when the category mapping is missing
18
+ from dataclasses import dataclass
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ import torch.nn.functional as F
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.generation import GenerationMixin
29
+ from transformers.image_processing_utils import select_best_resolution
30
+ from transformers.modeling_outputs import ModelOutput
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import (
33
+ add_start_docstrings,
34
+ add_start_docstrings_to_model_forward,
35
+ logging,
36
+ replace_return_docstrings,
37
+ )
38
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
39
+ from transformers.models.llava_next.configuration_llava_next import LlavaNextConfig
40
+
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+ _CONFIG_FOR_DOC = "LlavaNextConfig"
45
+ from pathlib import Path
46
+
47
+ def save_list_to_incremental_file(data_list, save_dir="/common/home/users/w/wzhao/vqclip/llava_next_tensors"):
48
+ """
49
+ Save a list to the given directory, using an auto-incrementing numeric file name.
50
+
51
+ Args:
52
+ data_list: the list data to save
53
+ save_dir: the directory to save into
54
+
55
+ Returns:
56
+ The path of the saved file.
57
+ """
58
+ # Make sure the directory exists
59
+ save_dir = Path(save_dir)
60
+ save_dir.mkdir(parents=True, exist_ok=True)
61
+
62
+ # Find the next available file name
63
+ index = 1
64
+ while True:
65
+ file_path = save_dir / f"{index}.npy"
66
+ if not file_path.exists():
67
+ break
68
+ index += 1
69
+
70
+ # Convert the list to a numpy array and save it
71
+ np_array = np.array(data_list)
72
+ np.save(str(file_path), np_array)
73
+
74
+ return file_path
75
+
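A quick usage sketch for the helper above; the directory here is an illustrative stand-in for the cluster-specific default path in the signature.

# First call writes 1.npy, the next call 2.npy, and so on.
saved_path = save_list_to_incremental_file([0.1, 0.2, 0.3], save_dir="./debug_tensors")
print(saved_path)  # e.g. debug_tensors/1.npy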
76
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
77
+ """
78
+ Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
79
+
80
+ Args:
81
+ image_size (`tuple`):
82
+ The size of the input image in the format (height, width).
83
+ grid_pinpoints (`List`):
84
+ A list containing possible resolutions. Each item in the list should be a tuple or list
85
+ of the form `(height, width)`.
86
+ patch_size (`int`):
87
+ The size of each image patch.
88
+
89
+ Returns:
90
+ tuple: The shape of the image patch grid in the format (height, width).
91
+ """
92
+ if not isinstance(grid_pinpoints, list):
93
+ raise TypeError("grid_pinpoints should be a list of tuples or lists")
94
+
95
+ # ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a tuple, otherwise the calculation will be wrong
96
+ if not isinstance(image_size, (list, tuple)):
97
+ if not isinstance(image_size, (torch.Tensor, np.ndarray)):
98
+ raise TypeError(
99
+ f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
100
+ )
101
+ image_size = image_size.tolist()
102
+
103
+ height, width = select_best_resolution(image_size, grid_pinpoints)
104
+ return height // patch_size, width // patch_size
105
+
106
+
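As a concrete illustration of the function above, with a made-up single-entry pinpoint list so that `select_best_resolution` has only one candidate to choose:

# One candidate resolution of 672x672 with 336-pixel patches gives a 2 x 2 grid.
grid = get_anyres_image_grid_shape(image_size=(480, 640), grid_pinpoints=[(672, 672)], patch_size=336)
print(grid)  # (2, 2)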
107
+ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
108
+ """
109
+ Calculate the number of patches after the preprocessing for images of any resolution.
110
+
111
+ Args:
112
+ image_size (`torch.LongTensor` or `np.ndarray` or `Tuple[int, int]`):
113
+ The size of the input image in the format (height, width).
114
+ grid_pinpoints (`List`):
115
+ A list containing possible resolutions. Each item in the list should be a tuple or list
116
+ of the form `(height, width)`.
117
+ patch_size (`int`):
118
+ The size of each image patch.
119
+
120
+ Returns:
121
+ int: the number of patches
122
+ """
123
+ if not isinstance(grid_pinpoints, list):
124
+ raise TypeError("grid_pinpoints should be a list of tuples or lists")
125
+
126
+ # ! VERY IMPORTANT: if image_size is a tensor, it must be converted to a tuple, otherwise the calculation will be wrong
127
+ if not isinstance(image_size, (list, tuple)):
128
+ if not isinstance(image_size, (torch.Tensor, np.ndarray)):
129
+ raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")
130
+ image_size = image_size.tolist()
131
+
132
+ best_resolution = select_best_resolution(image_size, grid_pinpoints)
133
+ height, width = best_resolution
134
+ num_patches = 0
135
+ # consider changing this to ceil(height / patch_size) * ceil(width / patch_size) + 1
136
+ for i in range(0, height, patch_size):
137
+ for j in range(0, width, patch_size):
138
+ num_patches += 1
139
+ # add the base patch
140
+ num_patches += 1
141
+ return num_patches
142
+
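A worked count for the loop above, with illustrative numbers: if the best resolution is 672x672 and the patch size is 336, the double loop visits a 2x2 grid and one base patch is added, so the result is 5.

# Same counting as in image_size_to_num_patches, spelled out for 672x672 / 336.
height, width, patch_size = 672, 672, 336
num_patches = sum(1 for _ in range(0, height, patch_size) for _ in range(0, width, patch_size)) + 1
print(num_patches)  # 5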
143
+
144
+ def unpad_image(tensor, original_size):
145
+ """
146
+ Unpads a PyTorch tensor of a padded and resized image.
147
+
148
+ Args:
149
+ tensor (`torch.Tensor`):
150
+ The image tensor, assumed to be of shape (num_channels, height, width).
151
+ original_size (`tuple`):
152
+ The original size of the image (height, width).
153
+
154
+ Returns:
155
+ `torch.Tensor`: The unpadded image tensor.
156
+ """
157
+ if not isinstance(original_size, (list, tuple)):
158
+ if not isinstance(original_size, (torch.Tensor, np.ndarray)):
159
+ raise TypeError(
160
+ f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
161
+ )
162
+ original_size = original_size.tolist()
163
+ original_height, original_width = original_size
164
+ current_height, current_width = tensor.shape[1:]
165
+
166
+ original_aspect_ratio = original_width / original_height
167
+ current_aspect_ratio = current_width / current_height
168
+
169
+ if original_aspect_ratio > current_aspect_ratio:
170
+ scale_factor = current_width / original_width
171
+ new_height = int(original_height * scale_factor)
172
+ padding = (current_height - new_height) // 2
173
+ unpadded_tensor = tensor[:, padding : current_height - padding, :]
174
+ else:
175
+ scale_factor = current_height / original_height
176
+ new_width = int(original_width * scale_factor)
177
+ padding = (current_width - new_width) // 2
178
+ unpadded_tensor = tensor[:, :, padding : current_width - padding]
179
+
180
+ return unpadded_tensor
181
+
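A small worked example for the unpadding above (the tensor and original size are arbitrary illustrations): a 336x672 feature map whose source image was 200x600 keeps its full width, and 56 rows of vertical padding are stripped from the top and bottom.

x = torch.zeros(3, 336, 672)               # (channels, height, width) after pad-and-resize
print(unpad_image(x, (200, 600)).shape)    # torch.Size([3, 224, 672])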
182
+
183
+ @dataclass
184
+ class LlavaNextCausalLMOutputWithPast(ModelOutput):
185
+ """
186
+ Base class for LlavaNext causal language model (or autoregressive) outputs.
187
+
188
+ Args:
189
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
190
+ Language modeling loss (for next-token prediction).
191
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
192
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
193
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
194
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
195
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
196
+
197
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
198
+ `past_key_values` input) to speed up sequential decoding.
199
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
200
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
201
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
202
+
203
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
204
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
205
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
206
+ sequence_length)`.
207
+
208
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
209
+ heads.
210
+ image_hidden_states (`torch.FloatTensor`, *optional*):
211
+ A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
212
+ image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
213
+ """
214
+
215
+ loss: Optional[torch.FloatTensor] = None
216
+ logits: torch.FloatTensor = None
217
+ past_key_values: Optional[List[torch.FloatTensor]] = None
218
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
219
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
220
+ image_hidden_states: Optional[torch.FloatTensor] = None
221
+
222
+ class VectorQuantizer(nn.Module):
223
+ def __init__(self, num_embeddings: int, embedding_dim: int, commitment_cost: float = 0.25):
224
+ super().__init__()
225
+ self.num_embeddings = num_embeddings
226
+ self.embedding_dim = embedding_dim
227
+ self.commitment_cost = commitment_cost
228
+
229
+ # Embedding table
230
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
231
+ self.embedding.weight.data.uniform_(-1/num_embeddings, 1/num_embeddings)
232
+
233
+ def forward(self, inputs):
234
+
235
+ self.embedding.weight.data = self.embedding.weight.data.to(dtype=inputs.dtype)
236
+ # Convert inputs from (batch, embedding_dim, length) -> (batch, length, embedding_dim)
237
+ inputs = inputs.permute(0, 2, 1).contiguous()
238
+ input_shape = inputs.shape
239
+
240
+ # Flatten input
241
+ flat_input = inputs.view(-1, self.embedding_dim)
242
+
243
+ # Calculate distances
244
+ distances = (torch.sum(flat_input**2, dim=1, keepdim=True)
245
+ + torch.sum(self.embedding.weight**2, dim=1)
246
+ - 2 * torch.matmul(flat_input, self.embedding.weight.t()))
247
+
248
+ # Encoding
249
+ encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1)
250
+ encodings = torch.zeros(encoding_indices.shape[0], self.num_embeddings, device=inputs.device).to(inputs.dtype)
251
+ encodings.scatter_(1, encoding_indices, 1)
252
+ #self.embedding.weight = self.embedding.weight.to(input_type)
253
+ # Quantize and unflatten
254
+ #print(inputs.dtype)
255
+ #print(self.embedding.weight.dtype)
256
+ quantized = torch.matmul(encodings, self.embedding.weight).view(input_shape)
257
+
258
+ # Loss
259
+ e_latent_loss = torch.mean((quantized.detach() - inputs)**2)
260
+ q_latent_loss = torch.mean((quantized - inputs.detach())**2)
261
+ loss = q_latent_loss + self.commitment_cost * e_latent_loss
262
+ print("this is q_latent_loss", q_latent_loss)
263
+ print("This is e_latent_loss", self.commitment_cost * e_latent_loss)
264
+ quantized = inputs + (quantized - inputs).detach()
265
+ avg_probs = torch.mean(encodings, dim=0)
266
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
267
+
268
+ # Convert quantized back from (batch, length, embedding_dim) -> (batch, embedding_dim, length)
269
+ return quantized.permute(0, 2, 1).contiguous(), loss, perplexity
270
+
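A minimal shape-check sketch for the quantizer above; the sizes are arbitrary, and per the in-code comment the input is laid out channels-first, i.e. (batch, embedding_dim, length).

vq = VectorQuantizer(num_embeddings=16, embedding_dim=8)
x = torch.randn(2, 8, 5)                   # (batch, embedding_dim, length)
quantized, vq_loss, perplexity = vq(x)
print(quantized.shape)                     # torch.Size([2, 8, 5]), same layout as the input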
271
+ # Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
272
+ class LlavaNextMultiModalProjector(nn.Module):
273
+ def __init__(self, config: LlavaNextConfig):
274
+ super().__init__()
275
+
276
+ self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
277
+ self.act = ACT2FN[config.projector_hidden_act]
278
+ self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
279
+ self.vq = VectorQuantizer(
280
+ num_embeddings=16000, # codebook size
281
+ embedding_dim=config.text_config.hidden_size, # dimension of each embedding vector
282
+ commitment_cost=0.5
283
+ )
284
+ self.vq_cls = VectorQuantizerCLS(
285
+ num_embeddings=128,
286
+ embedding_dim=4096,
287
+ commitment_cost=0.25,
288
+ use_cosine=True
289
+ )
290
+ def forward(self, image_features):
291
+ cls_features = image_features[: , :1]
292
+ cls_features = self.linear_1(cls_features)
293
+ cls_features = self.act(cls_features)
294
+ cls_features = self.linear_2(cls_features)
295
+ cls_features = cls_features[:, 0:]
296
+ cls_features = cls_features.mean(dim=0, keepdim=True).squeeze(0)
297
+ #save_list_to_incremental_file(cls_features.cpu().detach().numpy())
298
+ quantized, loss, perplexity, indices = self.vq_cls(cls_features)
299
+ categories = self.vq_cls.get_category_from_index(indices)
300
+ indices = indices.cpu().numpy()
301
+ print(indices)
302
+ print(categories)
303
+ if categories[0] != 0:
304
+ raise ValueError([indices, categories[0]])
305
+ #save_list_to_incremental_file(save_list)
306
+ # tensor(54)
307
+ # ['porn']
308
+ image_features = image_features[: , 1:]
309
+ hidden_states = self.linear_1(image_features)
310
+ hidden_states = self.act(hidden_states)
311
+ hidden_states = self.linear_2(hidden_states)
312
+
313
+ quantized_features, vq_loss, perplexity = self.vq(hidden_states)
314
+ print(quantized_features.shape)
315
+ return quantized_features, vq_loss
316
+
317
+
318
+ LLAVA_NEXT_START_DOCSTRING = r"""
319
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
320
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
321
+ etc.)
322
+
323
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
324
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
325
+ and behavior.
326
+
327
+ Parameters:
328
+ config ([`LlavaNextConfig`] or [`LlavaNextVisionConfig`]):
329
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
330
+ load the weights associated with the model, only the configuration. Check out the
331
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
332
+ """
333
+ class VectorQuantizerCLS(nn.Module):
334
+ def __init__(self, num_embeddings: int = 64, embedding_dim: int = 4096, commitment_cost: float = 0.25,
335
+ codebook_path: str = None, mapping_path: str = None, use_cosine: bool = True,
336
+ randomize_indices: bool = True):
337
+ super().__init__()
338
+ self.num_embeddings = num_embeddings
339
+ self.embedding_dim = embedding_dim
340
+ self.commitment_cost = commitment_cost
341
+ self.use_cosine = use_cosine
342
+
343
+ # Embedding table
344
+ self.embedding = nn.Embedding(num_embeddings, embedding_dim)
345
+ self.embedding.weight.data.uniform_(-1/num_embeddings, 1/num_embeddings)
346
+
347
+ # Initialize buffers with an appropriate size to avoid size mismatches when loading
348
+ self.register_buffer('_category_mapping_indices', torch.zeros(num_embeddings, dtype=torch.long))
349
+ self.register_buffer('_category_mapping_names', torch.zeros(num_embeddings, dtype=torch.long))
350
+
351
+ # Non-persistent attribute
352
+ self.center_to_category = None
353
+
354
+ # Load a precomputed codebook
355
+ if codebook_path is not None and mapping_path is not None:
356
+ self.load_codebook(codebook_path, mapping_path, randomize_indices)
357
+
358
+ def load_codebook(self, codebook_path, mapping_path, randomize_indices=True):
359
+ """加载预计算的codebook和类别映射,并可选择随机化索引"""
360
+ try:
361
+ # Load the codebook
362
+ print(f"Loading codebook from {codebook_path}")
363
+ centers = np.load(codebook_path)
364
+ print(f"Loaded codebook with shape: {centers.shape}")
365
+
366
+ # Load the category mapping
367
+ print(f"Loading category mappings from {mapping_path}")
368
+ with open(mapping_path, 'rb') as f:
369
+ mappings = pickle.load(f)
370
+
371
+ # Map text category names to integer ids
372
+ category_mapping_text = mappings['category_mapping']
373
+ classes = {'neutral':0, 'porn':1, 'gun':2, 'cigarette':3, 'alcohol':4, 'knife':5, 'blood':6, 'insulting_gesture':7}
374
+
375
+ # Convert to a numeric mapping
376
+ center_category_mapping = {}
377
+ for i, category_text in enumerate(category_mapping_text):
378
+ center_category_mapping[i] = classes.get(category_text, 0) # default to neutral (0)
379
+
380
+ print(f"Loaded {len(center_category_mapping)} category mappings")
381
+
382
+ # Prepare the data
383
+ actual_centers = centers.shape[0]
384
+ print(f"Actual centers: {actual_centers}")
385
+
386
+ # Update num_embeddings to the actual number of centers
387
+ self.num_embeddings = actual_centers
388
+ print(f"Setting num_embeddings to {self.num_embeddings}")
389
+
390
+ # If requested, randomize the indices with a random permutation
391
+ if randomize_indices:
392
+ print("Randomizing codebook indices to prevent category clustering")
393
+ # Build a random permutation
394
+ permutation = list(range(actual_centers))
395
+ random.shuffle(permutation)
396
+ inverse_permutation = {v: k for k, v in enumerate(permutation)}
397
+
398
+ # Apply the permutation to the centers and the category mapping
399
+ permuted_centers = np.zeros_like(centers)
400
+ permuted_categories = {}
401
+
402
+ for new_idx, old_idx in enumerate(permutation):
403
+ permuted_centers[new_idx] = centers[old_idx]
404
+ if old_idx < len(center_category_mapping):
405
+ permuted_categories[new_idx] = center_category_mapping[old_idx]
406
+
407
+ # Use the permuted data
408
+ centers = permuted_centers
409
+ self.center_to_category = permuted_categories
410
+
411
+ # Print a few examples of the randomized mapping
412
+ print("Sample randomized mappings:")
413
+ for i in range(min(5, len(self.center_to_category))):
414
+ print(f" New index {i}: {self.center_to_category[i]}")
415
+ else:
416
+ # No randomization: use the original mapping directly
417
+ self.center_to_category = {i: center_category_mapping[i]
418
+ for i in range(min(actual_centers, len(center_category_mapping)))}
419
+
420
+ # Verify that the category mapping is complete
421
+ for i in range(self.num_embeddings):
422
+ if i not in self.center_to_category:
423
+ print(f"Warning: No category mapping for center {i}, setting to 0")
424
+ self.center_to_category[i] = 0 # use 0 in place of "unknown"
425
+
426
+ # Build the embedding data and update the layer
427
+ embedding_data = torch.tensor(centers, dtype=torch.float32)
428
+
429
+ # Re-create the embedding layer to match the actual size
430
+ self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
431
+ self.embedding.weight.data.copy_(embedding_data)
432
+
433
+ # Re-register the buffers to match the new size
434
+ self.register_buffer('_category_mapping_indices', torch.zeros(self.num_embeddings, dtype=torch.long))
435
+ self.register_buffer('_category_mapping_names', torch.zeros(self.num_embeddings, dtype=torch.long))
436
+
437
+ # Store the category mapping in buffers (used for the state_dict)
438
+ self._store_category_mapping()
439
+
440
+ print(f"Successfully loaded codebook with {self.num_embeddings} entries")
441
+
442
+ # Analyze the category distribution
443
+ category_counts = {}
444
+ for category in self.center_to_category.values():
445
+ if category in category_counts:
446
+ category_counts[category] += 1
447
+ else:
448
+ category_counts[category] = 1
449
+
450
+ print("Category distribution in codebook:")
451
+ for category, count in sorted(category_counts.items()):
452
+ print(f" {category}: {count} centers")
453
+
454
+ return True
455
+
456
+ except Exception as e:
457
+ print(f"Error loading codebook: {e}")
458
+ import traceback
459
+ traceback.print_exc()
460
+ print("Using random initialization instead")
461
+ return False
462
+
463
+ def _store_category_mapping(self):
464
+ """将类别映射存储到模型的buffer中,以便在state_dict中保存"""
465
+ if not self.center_to_category:
466
+ warnings.warn("No category mapping to store")
467
+ return
468
+
469
+ # Collect all category ids
470
+ all_categories = sorted(set(self.center_to_category.values()))
471
+
472
+ # Build the lists of indices and their corresponding category ids
473
+ indices = list(self.center_to_category.keys())
474
+ category_ids = [self.center_to_category[idx] for idx in indices]
475
+
476
+ # Make sure the indices array length matches the buffer size
477
+ if len(indices) != self._category_mapping_indices.size(0):
478
+ # Re-register the buffers to match the size
479
+ self.register_buffer('_category_mapping_indices', torch.zeros(len(indices), dtype=torch.long))
480
+ self.register_buffer('_category_mapping_names', torch.zeros(len(indices), dtype=torch.long))
481
+
482
+ # Store into the buffers
483
+ self._category_mapping_indices.copy_(torch.tensor(indices, dtype=torch.long))
484
+ self._category_mapping_names.copy_(torch.tensor(category_ids, dtype=torch.long))
485
+
486
+ print(f"Stored category mapping with {len(indices)} entries and {len(all_categories)} unique categories")
487
+
488
+ def _load_category_mapping(self):
489
+ """从模型的buffer恢复类别映射"""
490
+ if not hasattr(self, '_category_mapping_indices') or self._category_mapping_indices.numel() == 0:
491
+ warnings.warn("No stored category mapping found")
492
+ return {}
493
+
494
+ # Rebuild the category-mapping dictionary
495
+ indices = self._category_mapping_indices.tolist()
496
+ category_ids = self._category_mapping_names.tolist()
497
+
498
+ mapping = {}
499
+ for idx, cat_id in zip(indices, category_ids):
500
+ mapping[idx] = cat_id
501
+
502
+ return mapping
503
+
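The two helpers above form a simple round trip through the registered buffers; a sketch with an arbitrary four-entry codebook and hypothetical category ids:

m = VectorQuantizerCLS(num_embeddings=4, embedding_dim=4096)
m.center_to_category = {0: 1, 1: 0, 2: 3, 3: 0}   # hypothetical index -> category-id mapping
m._store_category_mapping()                        # written into the _category_mapping_* buffers
print(m._load_category_mapping())                  # {0: 1, 1: 0, 2: 3, 3: 0}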
504
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
505
+ """自定义state_dict加载方法,处理buffer大小不匹配的问题"""
506
+ # Check and resize the buffers to match the loaded state_dict
507
+ indices_key = prefix + '_category_mapping_indices'
508
+ names_key = prefix + '_category_mapping_names'
509
+
510
+ if indices_key in state_dict and names_key in state_dict:
511
+ indices_size = state_dict[indices_key].size()
512
+ names_size = state_dict[names_key].size()
513
+
514
+ # Re-register the buffers to match the loaded sizes
515
+ if hasattr(self, '_category_mapping_indices') and self._category_mapping_indices.size() != indices_size:
516
+ self.register_buffer('_category_mapping_indices', torch.zeros(indices_size, dtype=torch.long))
517
+
518
+ if hasattr(self, '_category_mapping_names') and self._category_mapping_names.size() != names_size:
519
+ self.register_buffer('_category_mapping_names', torch.zeros(names_size, dtype=torch.long))
520
+
521
+ # Call the parent method to load the regular parameters
522
+ super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
523
+
524
+ # Rebuild the category mapping after loading
525
+ self.center_to_category = self._load_category_mapping()
526
+
527
+ # Update num_embeddings to match the loaded model
528
+ if hasattr(self, 'embedding') and hasattr(self.embedding, 'weight'):
529
+ self.num_embeddings = self.embedding.weight.size(0)
530
+
531
+ def forward(self, inputs):
532
+ """
533
+ Forward pass; expects an input of shape (1, 4096).
534
+
535
+ Args:
536
+ inputs: a feature vector of shape (1, 4096)
537
+
538
+ Returns:
539
+ quantized: the quantized feature vector
540
+ loss: the commitment loss
541
+ perplexity: the codebook perplexity
542
+ encoding_indices: the encoding indices
543
+ """
544
+ # Validate the input shape
545
+ if inputs.shape != (1, 4096):
546
+ raise ValueError(f"Expected input shape (1, 4096), got {inputs.shape}")
547
+
548
+ # Make sure the embedding weights use the same dtype as the input
549
+ self.embedding.weight.data = self.embedding.weight.data.to(dtype=inputs.dtype)
550
+
551
+ # Use the input directly; no reshaping needed
552
+ flat_input = inputs
553
+
554
+ # Compute distances to each codebook vector
555
+ if self.use_cosine:
556
+ # Normalize vectors for the cosine-similarity computation
557
+ normalized_input = F.normalize(flat_input, p=2, dim=1)
558
+ normalized_weights = F.normalize(self.embedding.weight, p=2, dim=1)
559
+
560
+ # Compute cosine similarity
561
+ cosine_sim = torch.matmul(normalized_input, normalized_weights.t())
562
+
563
+ # Convert similarity to distance (highest similarity corresponds to smallest distance)
564
+ distances = 1 - cosine_sim
565
+ else:
566
+ # Use (squared) Euclidean distance
567
+ distances = (torch.sum(flat_input**2, dim=1, keepdim=True)
568
+ + torch.sum(self.embedding.weight**2, dim=1)
569
+ - 2 * torch.matmul(flat_input, self.embedding.weight.t()))
570
+
571
+ # Find the index of the nearest codebook vector
572
+ encoding_indices = torch.argmin(distances, dim=1).unsqueeze(1)
573
+
574
+ # Build one-hot encodings
575
+ encodings = torch.zeros(encoding_indices.shape[0], self.num_embeddings, device=inputs.device).to(inputs.dtype)
576
+ encodings.scatter_(1, encoding_indices, 1)
577
+
578
+ # Quantize
579
+ quantized = torch.matmul(encodings, self.embedding.weight)
580
+
581
+ # Compute the codebook and commitment losses
582
+ e_latent_loss = torch.mean((quantized.detach() - flat_input)**2)
583
+ q_latent_loss = torch.mean((quantized - flat_input.detach())**2)
584
+ loss = q_latent_loss + self.commitment_cost * e_latent_loss
585
+
586
+ print("this is q_latent_loss", q_latent_loss)
587
+ print("This is e_latent_loss", self.commitment_cost * e_latent_loss)
588
+
589
+ # Straight-through estimator
590
+ quantized = flat_input + (quantized - flat_input).detach()
591
+
592
+ # Compute the perplexity
593
+ avg_probs = torch.mean(encodings, dim=0)
594
+ perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
595
+
596
+ # Return the quantized vector, the loss, the perplexity, and the indices
597
+ return quantized, loss, perplexity, encoding_indices.squeeze()
598
+
599
+ def encode(self, inputs):
600
+ """
601
+ Run only the encoding step and return the indices.
602
+
603
+ Args:
604
+ inputs: a feature vector of shape (1, 4096)
605
+
606
+ Returns:
607
+ The encoding indices.
608
+ """
609
+ # Validate the input shape
610
+ if inputs.shape != (1, 4096):
611
+ raise ValueError(f"Expected input shape (1, 4096), got {inputs.shape}")
612
+
613
+ with torch.no_grad():
614
+ # Compute distances
615
+ if self.use_cosine:
616
+ normalized_input = F.normalize(inputs, p=2, dim=1)
617
+ normalized_weights = F.normalize(self.embedding.weight, p=2, dim=1)
618
+ cosine_sim = torch.matmul(normalized_input, normalized_weights.t())
619
+ distances = 1 - cosine_sim
620
+ else:
621
+ distances = (torch.sum(inputs**2, dim=1, keepdim=True)
622
+ + torch.sum(self.embedding.weight**2, dim=1)
623
+ - 2 * torch.matmul(inputs, self.embedding.weight.t()))
624
+
625
+ # Find the index of the nearest codebook vector
626
+ encoding_indices = torch.argmin(distances, dim=1)
627
+
628
+ return encoding_indices
629
+
630
+ def get_category_from_index(self, indices):
631
+ """
632
+ Look up the category id corresponding to each index.
633
+
634
+ Args:
635
+ indices: the encoding indices
636
+
637
+ Returns:
638
+ A list of category ids.
639
+ """
640
+ # If there is no category mapping, try to restore it from the buffers
641
+ if self.center_to_category is None:
642
+ self.center_to_category = self._load_category_mapping()
643
+
644
+ if not self.center_to_category:
645
+ return [0] * indices.numel() # use 0 (neutral) in place of "unknown"
646
+
647
+ # Convert the index tensor to a NumPy array
648
+ indices_np = indices.cpu().numpy().flatten()
649
+
650
+ # Look up the categories
651
+ categories = []
652
+ for idx in indices_np:
653
+ idx_int = int(idx)
654
+ category = self.center_to_category.get(idx_int, 0) # default to 0 (neutral)
655
+ categories.append(category)
656
+
657
+ return categories
658
+
659
+ def classify(self, inputs):
660
+ """
661
+ Classify the input features, returning the category ids and the indices.
662
+
663
+ Args:
664
+ inputs: a feature vector of shape (1, 4096)
665
+
666
+ Returns:
667
+ categories: the predicted category ids
668
+ indices: the encoding indices
669
+ """
670
+ # Validate the input shape
671
+ if inputs.shape != (1, 4096):
672
+ raise ValueError(f"Expected input shape (1, 4096), got {inputs.shape}")
673
+
674
+ indices = self.encode(inputs)
675
+ categories = self.get_category_from_index(indices)
676
+ return categories, indices
677
+
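End to end, the CLS quantizer can be exercised on a dummy vector as below; without a codebook loaded via `load_codebook`, every center falls back to category 0 (neutral).

clf = VectorQuantizerCLS(num_embeddings=8, embedding_dim=4096, use_cosine=True)
feat = torch.randn(1, 4096)
categories, indices = clf.classify(feat)
print(categories, indices)                 # e.g. [0] and the index of the nearest center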
678
+
679
+ @add_start_docstrings(
680
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
681
+ LLAVA_NEXT_START_DOCSTRING,
682
+ )
683
+ # Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->LlavaNext,llava->llava_next
684
+ class LlavaNextPreTrainedModel(PreTrainedModel):
685
+ config_class = LlavaNextConfig
686
+ base_model_prefix = "model"
687
+ supports_gradient_checkpointing = True
688
+ _no_split_modules = ["LlavaNextVisionAttention"]
689
+ _skip_keys_device_placement = "past_key_values"
690
+ _supports_cache_class = True
691
+ _supports_flash_attn_2 = True
692
+ _supports_sdpa = True
693
+
694
+ def _init_weights(self, module):
695
+ # important: this ported version of LlavaNext isn't meant for training from scratch - only
696
+ # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
697
+ # https://github.com/haotian-liu/LLaVA/tree/main/llava_next should serve for that purpose
698
+ std = (
699
+ self.config.initializer_range
700
+ if hasattr(self.config, "initializer_range")
701
+ else self.config.text_config.initializer_range
702
+ )
703
+
704
+ if hasattr(module, "class_embedding"):
705
+ module.class_embedding.data.normal_(mean=0.0, std=std)
706
+
707
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
708
+ module.weight.data.normal_(mean=0.0, std=std)
709
+ if module.bias is not None:
710
+ module.bias.data.zero_()
711
+ elif isinstance(module, nn.Embedding):
712
+ module.weight.data.normal_(mean=0.0, std=std)
713
+ if module.padding_idx is not None:
714
+ module.weight.data[module.padding_idx].zero_()
715
+
716
+
717
+ LLAVA_NEXT_INPUTS_DOCSTRING = r"""
718
+ Args:
719
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
720
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
721
+ it.
722
+
723
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
724
+ [`PreTrainedTokenizer.__call__`] for details.
725
+
726
+ [What are input IDs?](../glossary#input-ids)
727
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
728
+ The tensors corresponding to the input images. Pixel values can be obtained using
729
+ [`AutoImageProcessor`]. See [`LlavaNextImageProcessor.__call__`] for details. [`LlavaProcessor`] uses
730
+ [`LlavaNextImageProcessor`] for processing images.
731
+ image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`, *optional*):
732
+ The sizes of the images in the batch, being (height, width) for each image.
733
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
734
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
735
+
736
+ - 1 for tokens that are **not masked**,
737
+ - 0 for tokens that are **masked**.
738
+
739
+ [What are attention masks?](../glossary#attention-mask)
740
+
741
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
742
+ [`PreTrainedTokenizer.__call__`] for details.
743
+
744
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
745
+ `past_key_values`).
746
+
747
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
748
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
749
+ information on the default strategy.
750
+
751
+ - 1 indicates the head is **not masked**,
752
+ - 0 indicates the head is **masked**.
753
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
754
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
755
+ config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
756
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
757
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
758
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
759
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
760
+
761
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
762
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
763
+
764
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
765
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
766
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
767
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
768
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
769
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
770
+ model's internal embedding lookup matrix.
771
+ vision_feature_layer (`int`, *optional*, defaults to -2):
772
+ The index of the layer to select the vision feature.
773
+ vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
774
+ The feature selection strategy used to select the vision feature from the vision backbone.
775
+ Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
776
+ If `"full"`, the full vision features are used.
777
+ use_cache (`bool`, *optional*):
778
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
779
+ `past_key_values`).
780
+ output_attentions (`bool`, *optional*):
781
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
782
+ tensors for more detail.
783
+ output_hidden_states (`bool`, *optional*):
784
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
785
+ more detail.
786
+ return_dict (`bool`, *optional*):
787
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
788
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
789
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
790
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
791
+ the complete sequence length.
792
+ """
793
+
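In practice the documented inputs are produced by the companion processor rather than assembled by hand. A hedged sketch of the usual call pattern, with a placeholder checkpoint path and image file (the exact prompt template depends on the underlying language model):

from PIL import Image
from transformers import LlavaNextProcessor

checkpoint = "path/to/this/checkpoint"                     # placeholder
processor = LlavaNextProcessor.from_pretrained(checkpoint)
model = LlavaNextForConditionalGeneration.from_pretrained(checkpoint)

image = Image.open("example.jpg")                          # placeholder image
prompt = processor.apply_chat_template(
    [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is shown here?"}]}],
    add_generation_prompt=True,
)
inputs = processor(images=image, text=prompt, return_tensors="pt")  # pixel_values, image_sizes, input_ids, attention_mask
output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output_ids[0], skip_special_tokens=True))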
794
+
795
+ @add_start_docstrings(
796
+ """The LLAVA-NeXT model which consists of a vision backbone and a language model.""",
797
+ LLAVA_NEXT_START_DOCSTRING,
798
+ )
799
+ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
800
+ def __init__(self, config: LlavaNextConfig):
801
+ super().__init__(config)
802
+ self.vision_tower = AutoModel.from_config(config.vision_config)
803
+
804
+ self.multi_modal_projector = LlavaNextMultiModalProjector(config)
805
+ embed_std = 1 / math.sqrt(config.text_config.hidden_size)
806
+ self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)
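+ # `image_newline` is a learned embedding appended after each row of high-resolution image patches
+ # (see `pack_image_features`); it is initialized with std `1 / sqrt(hidden_size)` via `embed_std` above.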
807
+
808
+ self.vocab_size = config.text_config.vocab_size
809
+ self.language_model = AutoModelForCausalLM.from_config(config.text_config)
810
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
811
+ self._padding_side = "left"  # set to "left" by default; users can change it via the `padding_side` setter
812
+ self.post_init()
813
+
814
+ @property
815
+ def padding_side(self):
816
+ return self._padding_side
817
+
818
+ @padding_side.setter
819
+ def padding_side(self, padding_side: str):
820
+ if padding_side not in ["left", "right"]:
821
+ raise ValueError(f"{padding_side} is not `left` or `right`.")
822
+ self._padding_side = padding_side
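+ # Typical usage (illustrative): `model.padding_side = "right"` for training and
+ # `model.padding_side = "left"` for batched generation, matching the tokenizer's padding side.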
823
+
824
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_input_embeddings
825
+ def get_input_embeddings(self):
826
+ return self.language_model.get_input_embeddings()
827
+
828
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_input_embeddings
829
+ def set_input_embeddings(self, value):
830
+ self.language_model.set_input_embeddings(value)
831
+
832
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_output_embeddings
833
+ def get_output_embeddings(self):
834
+ return self.language_model.get_output_embeddings()
835
+
836
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_output_embeddings
837
+ def set_output_embeddings(self, new_embeddings):
838
+ self.language_model.set_output_embeddings(new_embeddings)
839
+
840
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.set_decoder
841
+ def set_decoder(self, decoder):
842
+ self.language_model.set_decoder(decoder)
843
+
844
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.get_decoder
845
+ def get_decoder(self):
846
+ return self.language_model.get_decoder()
847
+
848
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.tie_weights
849
+ def tie_weights(self):
850
+ return self.language_model.tie_weights()
851
+
852
+ # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration.resize_token_embeddings
853
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
854
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
855
+ # update vocab size
856
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
857
+ self.vocab_size = model_embeds.num_embeddings
858
+ return model_embeds
859
+
860
+ def _merge_input_ids_with_image_features(
861
+ self,
862
+ image_features,
863
+ feature_lens,
864
+ inputs_embeds,
865
+ input_ids,
866
+ attention_mask,
867
+ position_ids=None,
868
+ labels=None,
869
+ image_token_index=None,
870
+ ignore_index=-100,
871
+ ):
872
+ """
873
+ Merge `input_ids` with image features into final embeddings.
874
+
875
+ Args:
876
+ image_features (`torch.Tensor` of shape `(all_feature_lens, embed_dim)`):
877
+ All vision vectors of all images in the batch
878
+ feature_lens (`torch.LongTensor` of shape `(num_images)`):
879
+ The length of visual embeddings of each image as stacked in `image_features`
880
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, embed_dim)`):
881
+ Token embeddings before merging with visual embeddings
882
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
883
+ Input_ids of tokens, possibly filled with image token
884
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
885
+ Mask to avoid performing attention on padding token indices.
886
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
887
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
888
+ config.n_positions - 1]`.
889
+ labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels need to be recalculated to support training (if provided).
+ image_token_index (`int`, *optional*):
+ Token id used to indicate the special "image" token. Defaults to `config.image_token_index`.
+ ignore_index (`int`, *optional*):
+ Value that is used to pad `labels` and will be ignored when calculating the loss. Default: -100.
895
+ Returns:
896
+ final_embedding, final_attention_mask, position_ids, final_labels
897
+
898
+ Explanation:
899
+ each image has variable length embeddings, with length specified by feature_lens
900
+ image_features is concatenation of all visual embed vectors
901
+ task: fill each <image> with the correct number of visual embeddings
902
+ Example:
903
+ X (5 patches), Y (3 patches), Z (8)
904
+ X, Y are in the same sequence (in-context learning)
905
+ if right padding
906
+ input_ids: [
907
+ a b c d e f X g h i j k Y l m
908
+ o p q r Z s t u v _ _ _ _ _ _
909
+ ]
910
+ input_ids should be: [
911
+ a b c d e f X X X X X g h i j k Y Y Y l m
912
+ o p q r Z Z Z Z Z Z Z Z s t u v _ _ _ _ _
913
+ ]
914
+ labels should be: [
915
+ a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
916
+ o p q r _ _ _ _ _ _ _ _ s t u v _ _ _ _ _
917
+ ]
918
+ elif left padding
919
+ input_ids: [
920
+ a b c d e f X g h i j k Y l m
921
+ _ _ _ _ _ _ o p q r Z s t u v
922
+ ]
923
+ input_ids should be: [
924
+ a b c d e f X X X X X g h i j k Y Y Y l m
925
+ _ _ _ _ _ o p q r Z Z Z Z Z Z Z Z s t u v
926
+ ]
927
+ labels should be: [
928
+ a b c d e f _ _ _ _ _ g h i j k _ _ _ l m
929
+ _ _ _ _ _ o p q r _ _ _ _ _ _ _ _ s t u v
930
+ ]
931
+ Edge cases:
932
+ * If the tokens are the same but the image token counts differ, then left or right padding cannot be inferred
933
+ ```python
934
+ cat_img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
935
+ chart_img = Image.open(requests.get("https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true", stream=True).raw)
936
+ prompts = [
937
+ "[INST] <image>\nWhat is shown in this image? [/INST]",
938
+ "[INST] <image>\nWhat is shown in this image? [/INST]",
939
+ ]
940
+ inputs = processor(prompts, [chart_img, cat_img], return_tensors='pt', padding=True).to("cuda")
941
+ # chart_img has 2634 tokens, while cat_img has 2340 tokens
942
+ ```
943
+
944
+ input_ids: [
945
+ a b c d X g h
946
+ i j Y k l m n
947
+ ]
948
+ where X is 3 tokens while Y is 5; this means that after merging,
949
+ if left-padding (batched generation)
950
+ input_ids should be: [
951
+ _ _ a b c d X X X g h
952
+ i j Y Y Y Y Y k l m n
953
+ ]
954
+ elif (right padding) (training)
955
+ input_ids should be: [
956
+ a b c d X X X g h _ _
957
+ i j Y Y Y Y Y k l m n
958
+ ]
959
+ """
960
+ image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
961
+ ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
962
+
963
+ if self.training and self.padding_side == "left":
964
+ logger.warning_once(
965
+ "Padding side is set to 'left' but the model is in training mode. For training "
966
+ "it is recommended to set `model.padding_side='right'` and `processor.tokenizer.padding_side='right'`. "
967
+ "If that's intended, ignore this warning"
968
+ )
969
+ if not self.training and self.padding_side == "right":
970
+ logger.warning_once(
971
+ "Padding side is set to 'right' but the model is in inference mode. For correct "
972
+ "generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. "
973
+ "If that's intended, ignore this warning"
974
+ )
975
+
976
+ with torch.no_grad():
977
+ # ! in llava 1.6, number of patches is variable
978
+ num_images = feature_lens.size(0)
979
+ num_image_features, embed_dim = image_features.shape
980
+ if feature_lens.sum() != num_image_features:
981
+ raise ValueError(f"{feature_lens=} / {feature_lens.sum()} != {image_features.shape=}")
982
+ batch_size = input_ids.shape[0]
983
+ _left_padding = torch.any(attention_mask[:, 0] == 0)
984
+ _right_padding = torch.any(attention_mask[:, -1] == 0)
985
+
986
+ left_padding = self.padding_side == "left"
987
+ if batch_size > 1:
988
+ if _left_padding and _right_padding:
989
+ raise ValueError(f"both sides of attention_mask have zeros, which is invalid. {attention_mask}")
990
+ elif _right_padding and left_padding:
991
+ left_padding = False
992
+ elif _left_padding and not left_padding:
993
+ left_padding = True
994
+
995
+ # Whether to turn off right padding
996
+ # 1. Create a mask to know where special image tokens are
997
+ special_image_token_mask = input_ids == image_token_index
998
+ # special_image_token_mask: [bsz, seqlen]
999
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
1000
+ # num_special_image_tokens: [bsz]
1001
+ # Reserve for padding of num_images
1002
+ total_num_special_image_tokens = torch.sum(special_image_token_mask)
1003
+ if total_num_special_image_tokens != num_images:
1004
+ raise ValueError(
1005
+ f"Number of image tokens in input_ids ({total_num_special_image_tokens}) different from num_images ({num_images})."
1006
+ )
1007
+ # Compute the maximum embed dimension
1008
+ # max_image_feature_lens is max_feature_lens per batch
1009
+ feature_lens = feature_lens.to(input_ids.device)
1010
+ feature_lens_batch = feature_lens.split(num_special_image_tokens.tolist(), dim=0)
1011
+ feature_lens_batch_sum = torch.tensor([x.sum() for x in feature_lens_batch], device=input_ids.device)
1012
+ embed_sequence_lengths = (
1013
+ (attention_mask == 1).long().sum(-1) - num_special_image_tokens + feature_lens_batch_sum
1014
+ )
1015
+ max_embed_dim = embed_sequence_lengths.max()
1016
+
1017
+ batch_indices, non_image_indices = torch.where((input_ids != image_token_index) & (attention_mask == 1))
1018
+ # 2. Compute the positions where text should be written
1019
+ # Calculate new positions for text tokens in merged image-text sequence.
1020
+ # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images` text tokens.
1021
+ # `torch.cumsum` computes how each image token shifts subsequent text token positions.
1022
+ # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
1023
+ # ! instead of special_image_token_mask * (num_image_patches - 1)
1024
+ # special_image_token_mask * (num_feature_len - 1)
1025
+ special_image_token_mask = special_image_token_mask.long()
1026
+ special_image_token_mask[special_image_token_mask == 1] = feature_lens - 1
1027
+ new_token_positions = torch.cumsum((special_image_token_mask + 1), -1) - 1
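+ # Worked example (illustrative numbers): for tokens [a, <image>, b, c, d] with feature_lens = [3],
+ # the mask above becomes [0, 2, 0, 0, 0], so cumsum(mask + 1) - 1 = [0, 3, 4, 5, 6]; the text tokens
+ # land at positions 0, 4, 5, 6 and slots 1-3 stay free for the three image features.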
1028
+ if left_padding:
1029
+ # shift right token positions so that they are ending at the same number
1030
+ # note: an earlier version used `new_token_positions += new_token_positions[:, -1].max() - new_token_positions[:, -1:]`, which was incorrect
1031
+ new_token_positions += max_embed_dim - 1 - new_token_positions[:, -1:]
1032
+
1033
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
1034
+
1035
+ # 3. Create the full embedding, already padded to the maximum position
1036
+ final_embedding = torch.zeros(
1037
+ batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
1038
+ )
1039
+ final_attention_mask = torch.zeros(
1040
+ batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
1041
+ )
1042
+ final_input_ids = torch.full(
1043
+ (batch_size, max_embed_dim), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device
1044
+ )
1045
+ # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
1046
+ # set the corresponding tensors into their correct target device.
1047
+ target_device = inputs_embeds.device
1048
+ batch_indices, non_image_indices, text_to_overwrite = (
1049
+ batch_indices.to(target_device),
1050
+ non_image_indices.to(target_device),
1051
+ text_to_overwrite.to(target_device),
1052
+ )
1053
+ attention_mask = attention_mask.to(target_device)
1054
+ input_ids = input_ids.to(target_device)
1055
+
1056
+ # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
1057
+ # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
1058
+ final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
1059
+ final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
1060
+ final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_image_indices]
1061
+ final_labels = None
1062
+ if labels is not None:
1063
+ labels = labels.to(target_device)
1064
+ final_labels = torch.full_like(final_attention_mask, ignore_index).to(torch.long)
1065
+ final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
1066
+
1067
+ # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
1068
+ with torch.no_grad():
1069
+ image_to_overwrite = torch.full(
1070
+ (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
1071
+ )
1072
+ image_to_overwrite[batch_indices, text_to_overwrite] = False
1073
+ embed_indices = torch.arange(max_embed_dim).unsqueeze(0).to(target_device)
1074
+ embed_indices = embed_indices.expand(batch_size, max_embed_dim)
1075
+ embed_seq_lens = embed_sequence_lengths[:, None].to(target_device)
1076
+
1077
+ if left_padding:
1078
+ # exclude padding on the left
1079
+ max_embed_dim = max_embed_dim.to(target_device)
1080
+ val = (max_embed_dim - embed_indices) <= embed_seq_lens
1081
+ else:
1082
+ # exclude padding on the right
1083
+ val = embed_indices < embed_seq_lens
1084
+ image_to_overwrite &= val
1085
+
1086
+ if image_to_overwrite.sum() != num_image_features:
1087
+ raise ValueError(
1088
+ f"{image_to_overwrite.sum()=} != {num_image_features=} The inputs provided to the model are wrong. "
+ f"The number of image tokens is {torch.sum(special_image_token_mask)} while"
+ f" the number of images given to the model is {num_images}. "
1091
+ f"This prevents correct indexing and breaks batch generation."
1092
+ )
1093
+ final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
1094
+ final_attention_mask |= image_to_overwrite
1095
+ position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
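+ # `position_ids` follow the merged attention mask: attended positions count up from 0, while padded
+ # positions are filled with the placeholder value 1 so they remain valid indices.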
1096
+
1097
+ return final_embedding, final_attention_mask, position_ids, final_labels, final_input_ids
1098
+
1099
+ def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
1100
+ """
1101
+ Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
1102
+
1103
+ Args:
1104
+ image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
1105
+ List of image feature tensors, each containing all the visual features of all patches.
1106
+ image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
1107
+ Actual image size of each image (H, W).
1108
+ vision_feature_select_strategy (`str`)
1109
+ The feature selection strategy used to select the vision feature from the vision backbone.
1110
+ image_newline (`torch.Tensor` of shape `(embed_dim)`)
1111
+ New line embedding vector.
1112
+ Returns:
1113
+ image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
1114
+ feature_lens (`List[int]`)
1115
+ token length of each image in image_features
1116
+ """
1117
+ new_image_features = []
1118
+ feature_lens = []
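+ # Shape sketch (e.g. CLIP ViT-L/14 at 336px, so height = width = 336 // 14 = 24): an image split into a
+ # 2x1 anyres grid arrives as (1 + 2, 576, embed_dim); the base patch keeps its 576 vectors, the
+ # high-resolution patches are re-tiled to (embed_dim, 48, 24), unpadded to the true aspect ratio,
+ # optionally extended with one `image_newline` vector per row, flattened, and concatenated after the
+ # base features.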
1119
+ for image_idx, image_feature in enumerate(image_features):
1120
+ if image_feature.shape[0] > 1:
1121
+ base_image_feature = image_feature[0]
1122
+ image_feature = image_feature[1:]
1123
+ height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
1124
+
1125
+ if vision_feature_select_strategy == "default":
1126
+ expected_num_patches = height * width
1127
+ elif vision_feature_select_strategy == "full":
1128
+ expected_num_patches = height * width + 1
1129
+ if expected_num_patches != base_image_feature.shape[0]:
1130
+ raise ValueError("The number of patches is not consistent with the image size.")
1131
+
1132
+ num_patch_height, num_patch_width = get_anyres_image_grid_shape(
1133
+ image_sizes[image_idx],
1134
+ self.config.image_grid_pinpoints,
1135
+ self.config.vision_config.image_size,
1136
+ )
1137
+ image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
1138
+ image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
1139
+ image_feature = image_feature.flatten(1, 2).flatten(2, 3)
1140
+ image_feature = unpad_image(image_feature, image_sizes[image_idx])
1141
+ if image_newline is not None:
1142
+ image_feature = torch.cat(
1143
+ (
1144
+ image_feature,
1145
+ image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.dtype),
1146
+ ),
1147
+ dim=-1,
1148
+ )
1149
+ image_feature = image_feature.flatten(1, 2).transpose(0, 1)
1150
+ image_feature = torch.cat((base_image_feature, image_feature), dim=0)
1151
+ else:
1152
+ image_feature = image_feature[0]
1153
+ if image_newline is not None:
1154
+ image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
1155
+ new_image_features.append(image_feature)
1156
+ feature_lens.append(image_feature.size(0))
1157
+ image_features = torch.cat(new_image_features, dim=0)
1158
+ feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
1159
+ return image_features, feature_lens
1160
+
1161
+ def get_image_features(
1162
+ self,
1163
+ pixel_values: torch.FloatTensor,
1164
+ image_sizes: torch.Tensor,
1165
+ vision_feature_layer: int,
1166
+ vision_feature_select_strategy: str,
1167
+ ):
1168
+ """
1169
+ Obtains image last hidden states from the vision tower and apply multimodal projection.
1170
+
1171
+ Args:
1172
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
1173
+ The tensors corresponding to the input images.
1174
+ image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
1175
+ Actual image size of each image (H, W).
1176
+ vision_feature_layer (`int`):
1177
+ The index of the layer to select the vision feature.
1178
+ vision_feature_select_strategy (`str`):
1179
+ The feature selection strategy used to select the vision feature from the vision backbone.
1180
+ Can be one of `"default"` or `"full"`
1181
+ Returns:
1182
+ image_features (List[`torch.Tensor`]): List of image feature tensors, each containing all the visual features of all patches,
+ of shape `(num_patches, image_length, embed_dim)`.
1184
+ """
1185
+ # ! infer image_num_patches from image_sizes
1186
+ image_num_patches = [
1187
+ image_size_to_num_patches(
1188
+ image_size=imsize,
1189
+ grid_pinpoints=self.config.image_grid_pinpoints,
1190
+ patch_size=self.config.vision_config.image_size,
1191
+ )
1192
+ for imsize in image_sizes
1193
+ ]
1194
+ if pixel_values.dim() == 5:
1195
+ # stacked if input is (batch_size, num_patches, num_channels, height, width)
1196
+ _pixel_values_list = [pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches)]
1197
+ pixel_values = torch.cat(_pixel_values_list, dim=0)
1198
+ elif pixel_values.dim() != 4:
1199
+ # otherwise has to be stacked from list of (num_patches, num_channels, height, width)
1200
+ raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
1201
+
1202
+ image_features = self.vision_tower(pixel_values, output_hidden_states=True)
1203
+ selected_image_feature = image_features.hidden_states[vision_feature_layer]
1204
+ # NOTE: unlike the upstream LLaVA-NeXT implementation, the CLS token is kept even when
+ # `vision_feature_select_strategy == "default"`; the full feature map is passed to the projector.
+ # if vision_feature_select_strategy == "default":
+ #     selected_image_feature = selected_image_feature[:, 1:]
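+ # Note: the multimodal projector used here returns a tuple `(features, vq_loss)`, where `vq_loss` is an
+ # auxiliary quantization loss; the upstream LLaVA-NeXT projector returns only the projected features.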
1208
+ image_features, vq_loss = self.multi_modal_projector(selected_image_feature)
1209
+ image_features = torch.split(image_features, image_num_patches, dim=0)
1210
+
1211
+
1212
+ return image_features, vq_loss
1213
+
1214
+ @add_start_docstrings_to_model_forward(LLAVA_NEXT_INPUTS_DOCSTRING)
1215
+ @replace_return_docstrings(output_type=LlavaNextCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1216
+ def forward(
1217
+ self,
1218
+ input_ids: torch.LongTensor = None,
1219
+ pixel_values: torch.FloatTensor = None,
1220
+ image_sizes: Optional[torch.LongTensor] = None,
1221
+ attention_mask: Optional[torch.Tensor] = None,
1222
+ position_ids: Optional[torch.LongTensor] = None,
1223
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1224
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1225
+ vision_feature_layer: Optional[int] = None,
1226
+ vision_feature_select_strategy: Optional[str] = None,
1227
+ labels: Optional[torch.LongTensor] = None,
1228
+ use_cache: Optional[bool] = None,
1229
+ output_attentions: Optional[bool] = None,
1230
+ output_hidden_states: Optional[bool] = None,
1231
+ return_dict: Optional[bool] = None,
1232
+ cache_position: Optional[torch.LongTensor] = None,
1233
+ num_logits_to_keep: int = 0,
1234
+ ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]:
1235
+ r"""
1236
+ Args:
1237
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1238
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1239
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1240
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1241
+
1242
+ num_logits_to_keep (`int`, *optional*):
1243
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1244
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1245
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1246
+
1247
+ Returns:
1248
+
1249
+ Example:
1250
+
1251
+ ```python
1252
+ >>> from PIL import Image
1253
+ >>> import requests
1254
+ >>> from transformers import AutoProcessor, LlavaNextForConditionalGeneration
1255
+
1256
+ >>> model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
1257
+ >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
1258
+
1259
+ >>> prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
1260
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1261
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1262
+
1263
+ >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
1264
+
1265
+ >>> # Generate
1266
+ >>> generate_ids = model.generate(**inputs, max_length=30)
1267
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1268
+ "[INST] \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot (...)"
1269
+ ```"""
1270
+
1271
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1272
+ output_hidden_states = (
1273
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1274
+ )
1275
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1276
+ vision_feature_layer = (
1277
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
1278
+ )
1279
+ vision_feature_select_strategy = (
1280
+ vision_feature_select_strategy
1281
+ if vision_feature_select_strategy is not None
1282
+ else self.config.vision_feature_select_strategy
1283
+ )
1284
+
1285
+ if (input_ids is None) ^ (inputs_embeds is not None):
1286
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1287
+
1288
+ if pixel_values is not None and inputs_embeds is not None:
1289
+ raise ValueError(
1290
+ "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
1291
+ )
1292
+
1293
+ legacy_processing = False
1294
+ if inputs_embeds is None:
1295
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1296
+
1297
+ # if the number of image tokens is more than image embeddings seq length, then prob we expanded it in processing
1298
+ # not very reliable, but we don't expect one to actually pass 500+ images for one prompt
1299
+ # In case we're in decoding stage, legacy behavior is checked by presence of pixel values even if use_cache=True
1300
+ legacy_processing = (
1301
+ (input_ids == self.config.image_token_index).sum(1).max() < self.config.image_seq_length
1302
+ ) or (input_ids.shape[-1] == 1 and pixel_values is not None)
1303
+
1304
+ image_features = None
1305
+ if pixel_values is not None and pixel_values.size(0) > 0:
1306
+ image_features, vq_loss = self.get_image_features(
1307
+ pixel_values,
1308
+ image_sizes,
1309
+ vision_feature_layer=vision_feature_layer,
1310
+ vision_feature_select_strategy=vision_feature_select_strategy,
1311
+ )
1312
+
1313
+ # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
1314
+ image_features, feature_lens = self.pack_image_features(
1315
+ image_features,
1316
+ image_sizes,
1317
+ vision_feature_select_strategy=vision_feature_select_strategy,
1318
+ image_newline=self.image_newline,
1319
+ )
1320
+
1321
+ if legacy_processing:
1322
+ logger.warning_once(
1323
+ "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
1324
+ "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
1325
+ "with `processor.patch_size = {{patch_size}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
1326
+ "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
1327
+ )
1328
+ if input_ids.shape[1] != 1:
1329
+ inputs_embeds = inputs_embeds.to(image_features.dtype)
1330
+ inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
1331
+ image_features,
1332
+ feature_lens,
1333
+ inputs_embeds,
1334
+ input_ids,
1335
+ attention_mask,
1336
+ position_ids,
1337
+ labels=labels,
1338
+ )
1339
+ cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)
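+ # `cache_position` is rebuilt here because the merged sequence (text plus expanded image features) is
+ # longer than the original `input_ids`.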
1340
+ else:
1341
+ # Retrieve the first layer to inspect the logits and mask out the hidden states
1342
+ # that are set to 0
1343
+ first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
1344
+
1345
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
1346
+ batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
1347
+
1348
+ # Get the target length
1349
+ target_length = input_ids.shape[1]
1350
+ past_length = first_layer_past_key_value.shape[-1]
1351
+
1352
+ extended_attention_mask = torch.ones(
1353
+ (attention_mask.shape[0], past_length),
1354
+ dtype=attention_mask.dtype,
1355
+ device=attention_mask.device,
1356
+ )
1357
+
1358
+ # Filter out only the tokens that can be un-attended, this can happen
1359
+ # if one uses Llava + Fused modules where the cache on the
1360
+ # first iteration is already big enough, or if one passes custom cache
1361
+ valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
1362
+ new_batch_index = batch_index[valid_indices]
1363
+ new_non_attended_tokens = non_attended_tokens[valid_indices]
1364
+
1365
+ # Zero-out the places where we don't need to attend
1366
+ extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
1367
+ attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
1368
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
1369
+ cache_position = torch.arange(attention_mask.shape[1], device=attention_mask.device)[-target_length:]
1370
+
1371
+ # TODO: @raushan retain only the new behavior after v4.47
1372
+ elif image_features is not None:
1373
+ n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
1374
+ n_image_features = image_features.shape[0]
1375
+ if n_image_tokens != n_image_features:
1376
+ raise ValueError(
1377
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
1378
+ )
1379
+ special_image_mask = (
1380
+ (input_ids == self.config.image_token_index)
1381
+ .unsqueeze(-1)
1382
+ .expand_as(inputs_embeds)
1383
+ .to(inputs_embeds.device)
1384
+ )
1385
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
1386
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
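+ # In the non-legacy path the processor has already expanded each <image> token into the right number of
+ # placeholder tokens, so the image features are simply scattered into those positions.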
1387
+
1388
+ outputs = self.language_model(
1389
+ attention_mask=attention_mask,
1390
+ position_ids=position_ids,
1391
+ past_key_values=past_key_values,
1392
+ inputs_embeds=inputs_embeds,
1393
+ use_cache=use_cache,
1394
+ output_attentions=output_attentions,
1395
+ output_hidden_states=output_hidden_states,
1396
+ return_dict=return_dict,
1397
+ cache_position=cache_position,
1398
+ num_logits_to_keep=num_logits_to_keep,
1399
+ )
1400
+
1401
+ logits = outputs[0]
1402
+
1403
+ loss = None
1404
+ if labels is not None:
1405
+ # Shift so that tokens < n predict n
1406
+ if attention_mask is not None:
1407
+ # we use the input attention mask to shift the logits and labels, because it is 2D.
1408
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
1409
+ shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
1410
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
1411
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
1412
+ else:
1413
+ shift_logits = logits[..., :-1, :].contiguous()
1414
+ shift_labels = labels[..., 1:].contiguous()
1415
+ # Flatten the tokens
1416
+ loss_fct = nn.CrossEntropyLoss()
1417
+ loss = loss_fct(
1418
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
1419
+ )
1420
+ print("This is original loss", loss)  # debug: language-modeling loss before any auxiliary terms
+ # vq_loss = vq_loss.to(loss.device)  # note: the VQ loss returned by the projector is currently not added to `loss`
1423
+ if not return_dict:
1424
+ output = (logits,) + outputs[1:]
1425
+ return (loss,) + output if loss is not None else output
1426
+
1427
+ return LlavaNextCausalLMOutputWithPast(
1428
+ loss=loss,
1429
+ logits=logits,
1430
+ past_key_values=outputs.past_key_values,
1431
+ hidden_states=outputs.hidden_states,
1432
+ attentions=outputs.attentions,
1433
+ image_hidden_states=image_features if pixel_values is not None else None,
1434
+ )
1435
+
1436
+ def prepare_inputs_for_generation(
1437
+ self,
1438
+ input_ids,
1439
+ past_key_values=None,
1440
+ inputs_embeds=None,
1441
+ pixel_values=None,
1442
+ image_sizes=None,
1443
+ attention_mask=None,
1444
+ cache_position=None,
1445
+ num_logits_to_keep=None,
1446
+ **kwargs,
1447
+ ):
1448
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
1449
+
1450
+ model_inputs = self.language_model.prepare_inputs_for_generation(
1451
+ input_ids,
1452
+ past_key_values=past_key_values,
1453
+ inputs_embeds=inputs_embeds,
1454
+ attention_mask=attention_mask,
1455
+ cache_position=cache_position,
1456
+ num_logits_to_keep=num_logits_to_keep,
1457
+ **kwargs,
1458
+ )
1459
+
1460
+ # If we're in the cached decoding stage, pixel values should be None because the input ids no longer contain the special image token
+ # Otherwise we need the pixel values to be passed to the model
1462
+ if cache_position[0] == 0:
1463
+ model_inputs["pixel_values"] = pixel_values
1464
+ model_inputs["image_sizes"] = image_sizes
1465
+
1466
+ return model_inputs
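+
+ # A minimal usage sketch (not part of the model implementation). Assumptions: this module is importable
+ # as in the accompanying notebook, `model_id` points to a compatible processor such as
+ # `llava-hf/llama3-llava-next-8b-hf`, and `checkpoint_dir` points to a fine-tuned checkpoint like the one
+ # uploaded in this repository; adjust these names for your setup.
+ if __name__ == "__main__":
+     import requests
+     from PIL import Image
+     from transformers import LlavaProcessor
+
+     model_id = "llava-hf/llama3-llava-next-8b-hf"   # assumed processor source
+     checkpoint_dir = "vincentchao/qllava-next"      # assumed checkpoint location
+
+     processor = LlavaProcessor.from_pretrained(model_id)
+     model = LlavaNextForConditionalGeneration.from_pretrained(checkpoint_dir, torch_dtype=torch.float32).cuda()
+
+     url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+     image = Image.open(requests.get(url, stream=True).raw)
+     prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
+
+     # The processor expands <image> into the right number of placeholder tokens when configured with
+     # `patch_size` and `vision_feature_select_strategy` (see the legacy-processing warning in `forward`).
+     inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+     generate_ids = model.generate(**inputs, max_new_tokens=64)
+     print(processor.batch_decode(generate_ids, skip_special_tokens=True)[0])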
test_on_new_qllava.ipynb ADDED
@@ -0,0 +1,2264 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from huggingface_hub import HfApi\n",
11
+ "api = HfApi()\n",
12
+ "# Replace with your username/repo name\n",
13
+ "repo_id = \"vincentchao/qllava-next\"\n",
14
+ "# # Create the repo (if it does not exist yet)\n",
15
+ "api.create_repo(repo_id, repo_type=\"model\", private=False)\n",
16
+ "\n",
17
+ "# Upload the entire folder\n",
18
+ "api.upload_folder(\n",
19
+ " folder_path=\"/common/home/users/w/wzhao/vqclip/qllava_next_newest\",\n",
20
+ " repo_id=repo_id,\n",
21
+ " repo_type=\"model\"\n",
22
+ ")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "classes2id = { 'neutral':0, 'porn':1,'gun':2,'cigarette':3,'alcohol':4, 'knife':5,'blood':6,'insulting_gesture':7}\n",
32
+ "id2class = ['neutral','porn','gun','cigarette','alcohol',\"knife\",'blood','insulting_gesture']"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 2,
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "data": {
42
+ "application/vnd.jupyter.widget-view+json": {
43
+ "model_id": "56dbb817e0a24197b2749a8ff82fe593",
44
+ "version_major": 2,
45
+ "version_minor": 0
46
+ },
47
+ "text/plain": [
48
+ "Loading checkpoint shards: 0%| | 0/7 [00:00<?, ?it/s]"
49
+ ]
50
+ },
51
+ "metadata": {},
52
+ "output_type": "display_data"
53
+ }
54
+ ],
55
+ "source": [
56
+ "from transformers import LlavaProcessor\n",
57
+ "import torch\n",
58
+ "import logging\n",
59
+ "from transformers import TrainerCallback\n",
60
+ "\n",
61
+ "import datetime\n",
62
+ "import os\n",
63
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\" # only expose the first GPU\n",
64
+ "import torch # or tensorflow, etc.\n",
65
+ "import torch.nn as nn\n",
66
+ "import os\n",
67
+ "import torch.distributed as dist\n",
68
+ "from transformers import LlavaProcessor\n",
69
+ "from qllava3_test import LlavaNextForConditionalGeneration\n",
70
+ "# %%\n",
71
+ "model_id = \"/common/public/llava/llama3-llava-next-8b-hf\"\n",
72
+ "processor = LlavaProcessor.from_pretrained(\n",
73
+ " model_id ,\n",
74
+ ")\n",
75
+ "\n",
76
+ "# Load the model\n",
77
+ "model = LlavaNextForConditionalGeneration.from_pretrained(\n",
78
+ " \"/common/home/users/w/wzhao/vqclip/qllava_next_newest\", \n",
79
+ " #\"/common/home/users/w/wzhao/qllava/vqllava/checkpoint-260\",\n",
80
+ " # \"/common/home/users/w/wzhao/\"+path + \"/checkpoint-5000\",\n",
81
+ " torch_dtype=torch.float32, # loaded in float32 here (the earlier float16 memory-saving note no longer applies)\n",
82
+ ").cuda()"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": []
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stderr",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
102
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
103
+ ]
104
+ },
105
+ {
106
+ "name": "stdout",
107
+ "output_type": "stream",
108
+ "text": [
109
+ "Total images to process: 21213\n",
110
+ "Processing folder: /common/home/users/w/wzhao/llava_helper/nsfw_dataset_v1/porn/train (Category: porn)\n",
111
+ "this is q_latent_loss tensor(8.8170, device='cuda:0')\n",
112
+ "This is e_latent_loss tensor(2.2042, device='cuda:0')\n",
113
+ "57\n",
114
+ "[0]\n"
115
+ ]
116
+ },
117
+ {
118
+ "name": "stderr",
119
+ "output_type": "stream",
120
+ "text": [
121
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
122
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
123
+ ]
124
+ },
125
+ {
126
+ "name": "stdout",
127
+ "output_type": "stream",
128
+ "text": [
129
+ "this is q_latent_loss tensor(11.6122, device='cuda:0')\n",
130
+ "This is e_latent_loss tensor(2.9030, device='cuda:0')\n",
131
+ "57\n",
132
+ "[0]\n",
133
+ "this is q_latent_loss tensor(9.9424, device='cuda:0')\n",
134
+ "This is e_latent_loss tensor(2.4856, device='cuda:0')\n",
135
+ "57\n",
136
+ "[0]\n",
137
+ "this is q_latent_loss tensor(11.4321, device='cuda:0')\n",
138
+ "This is e_latent_loss tensor(2.8580, device='cuda:0')\n",
139
+ "57\n",
140
+ "[0]\n"
141
+ ]
142
+ },
143
+ {
144
+ "name": "stderr",
145
+ "output_type": "stream",
146
+ "text": [
147
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
148
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
149
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
150
+ ]
151
+ },
152
+ {
153
+ "name": "stdout",
154
+ "output_type": "stream",
155
+ "text": [
156
+ "this is q_latent_loss tensor(8.0276, device='cuda:0')\n",
157
+ "This is e_latent_loss tensor(2.0069, device='cuda:0')\n",
158
+ "57\n",
159
+ "[0]\n",
160
+ "this is q_latent_loss tensor(8.3328, device='cuda:0')\n",
161
+ "This is e_latent_loss tensor(2.0832, device='cuda:0')\n",
162
+ "57\n",
163
+ "[0]\n",
164
+ "this is q_latent_loss tensor(7.4713, device='cuda:0')\n",
165
+ "This is e_latent_loss tensor(1.8678, device='cuda:0')\n",
166
+ "57\n",
167
+ "[0]\n"
168
+ ]
169
+ },
170
+ {
171
+ "name": "stderr",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
175
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
176
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
177
+ ]
178
+ },
179
+ {
180
+ "name": "stdout",
181
+ "output_type": "stream",
182
+ "text": [
183
+ "this is q_latent_loss tensor(9.5524, device='cuda:0')\n",
184
+ "This is e_latent_loss tensor(2.3881, device='cuda:0')\n",
185
+ "57\n",
186
+ "[0]\n",
187
+ "this is q_latent_loss tensor(10.4501, device='cuda:0')\n",
188
+ "This is e_latent_loss tensor(2.6125, device='cuda:0')\n",
189
+ "57\n",
190
+ "[0]\n",
191
+ "this is q_latent_loss tensor(7.5718, device='cuda:0')\n",
192
+ "This is e_latent_loss tensor(1.8930, device='cuda:0')\n",
193
+ "57\n",
194
+ "[0]\n",
195
+ "Processed 10/21213 images\n"
196
+ ]
197
+ },
198
+ {
199
+ "name": "stderr",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
203
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
204
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
205
+ ]
206
+ },
207
+ {
208
+ "name": "stdout",
209
+ "output_type": "stream",
210
+ "text": [
211
+ "this is q_latent_loss tensor(8.9837, device='cuda:0')\n",
212
+ "This is e_latent_loss tensor(2.2459, device='cuda:0')\n",
213
+ "57\n",
214
+ "[0]\n",
215
+ "this is q_latent_loss tensor(9.0015, device='cuda:0')\n",
216
+ "This is e_latent_loss tensor(2.2504, device='cuda:0')\n",
217
+ "57\n",
218
+ "[0]\n",
219
+ "this is q_latent_loss tensor(11.5371, device='cuda:0')\n",
220
+ "This is e_latent_loss tensor(2.8843, device='cuda:0')\n",
221
+ "57\n",
222
+ "[0]\n"
223
+ ]
224
+ },
225
+ {
226
+ "name": "stderr",
227
+ "output_type": "stream",
228
+ "text": [
229
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
230
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
231
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
232
+ ]
233
+ },
234
+ {
235
+ "name": "stdout",
236
+ "output_type": "stream",
237
+ "text": [
238
+ "this is q_latent_loss tensor(8.5971, device='cuda:0')\n",
239
+ "This is e_latent_loss tensor(2.1493, device='cuda:0')\n",
240
+ "57\n",
241
+ "[0]\n",
242
+ "this is q_latent_loss tensor(10.3564, device='cuda:0')\n",
243
+ "This is e_latent_loss tensor(2.5891, device='cuda:0')\n",
244
+ "57\n",
245
+ "[0]\n",
246
+ "this is q_latent_loss tensor(10.8574, device='cuda:0')\n",
247
+ "This is e_latent_loss tensor(2.7143, device='cuda:0')\n",
248
+ "57\n",
249
+ "[0]\n"
250
+ ]
251
+ },
252
+ {
253
+ "name": "stderr",
254
+ "output_type": "stream",
255
+ "text": [
256
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
257
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
258
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
259
+ ]
260
+ },
261
+ {
262
+ "name": "stdout",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "this is q_latent_loss tensor(10.5497, device='cuda:0')\n",
266
+ "This is e_latent_loss tensor(2.6374, device='cuda:0')\n",
267
+ "57\n",
268
+ "[0]\n",
269
+ "this is q_latent_loss tensor(9.1719, device='cuda:0')\n",
270
+ "This is e_latent_loss tensor(2.2930, device='cuda:0')\n",
271
+ "57\n",
272
+ "[0]\n",
273
+ "this is q_latent_loss tensor(7.7135, device='cuda:0')\n",
274
+ "This is e_latent_loss tensor(1.9284, device='cuda:0')\n",
275
+ "57\n",
276
+ "[0]\n"
277
+ ]
278
+ },
279
+ {
280
+ "name": "stderr",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
284
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
285
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
286
+ ]
287
+ },
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "this is q_latent_loss tensor(7.7845, device='cuda:0')\n",
293
+ "This is e_latent_loss tensor(1.9461, device='cuda:0')\n",
294
+ "57\n",
295
+ "[0]\n",
296
+ "Processed 20/21213 images\n",
297
+ "this is q_latent_loss tensor(7.4899, device='cuda:0')\n",
298
+ "This is e_latent_loss tensor(1.8725, device='cuda:0')\n",
299
+ "57\n",
300
+ "[0]\n",
301
+ "this is q_latent_loss tensor(8.1208, device='cuda:0')\n",
302
+ "This is e_latent_loss tensor(2.0302, device='cuda:0')\n",
303
+ "57\n",
304
+ "[0]\n"
305
+ ]
306
+ },
307
+ {
308
+ "name": "stderr",
309
+ "output_type": "stream",
310
+ "text": [
311
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
312
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
313
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
314
+ ]
315
+ },
316
+ {
317
+ "name": "stdout",
318
+ "output_type": "stream",
319
+ "text": [
320
+ "this is q_latent_loss tensor(7.0984, device='cuda:0')\n",
321
+ "This is e_latent_loss tensor(1.7746, device='cuda:0')\n",
322
+ "57\n",
323
+ "[0]\n",
324
+ "this is q_latent_loss tensor(7.0674, device='cuda:0')\n",
325
+ "This is e_latent_loss tensor(1.7669, device='cuda:0')\n",
326
+ "57\n",
327
+ "[0]\n",
328
+ "this is q_latent_loss tensor(13.3643, device='cuda:0')\n",
329
+ "This is e_latent_loss tensor(3.3411, device='cuda:0')\n",
330
+ "57\n",
331
+ "[0]\n"
332
+ ]
333
+ },
334
+ {
335
+ "name": "stderr",
336
+ "output_type": "stream",
337
+ "text": [
338
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
339
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
340
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
341
+ ]
342
+ },
343
+ {
344
+ "name": "stdout",
345
+ "output_type": "stream",
346
+ "text": [
347
+ "this is q_latent_loss tensor(10.2910, device='cuda:0')\n",
348
+ "This is e_latent_loss tensor(2.5727, device='cuda:0')\n",
349
+ "57\n",
350
+ "[0]\n",
351
+ "this is q_latent_loss tensor(8.9972, device='cuda:0')\n",
352
+ "This is e_latent_loss tensor(2.2493, device='cuda:0')\n",
353
+ "57\n",
354
+ "[0]\n",
355
+ "this is q_latent_loss tensor(10.9211, device='cuda:0')\n",
356
+ "This is e_latent_loss tensor(2.7303, device='cuda:0')\n",
357
+ "57\n",
358
+ "[0]\n"
359
+ ]
360
+ },
361
+ {
362
+ "name": "stderr",
363
+ "output_type": "stream",
364
+ "text": [
365
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
366
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
367
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
368
+ ]
369
+ },
370
+ {
371
+ "name": "stdout",
372
+ "output_type": "stream",
373
+ "text": [
374
+ "this is q_latent_loss tensor(10.6993, device='cuda:0')\n",
375
+ "This is e_latent_loss tensor(2.6748, device='cuda:0')\n",
376
+ "57\n",
377
+ "[0]\n",
378
+ "this is q_latent_loss tensor(6.4199, device='cuda:0')\n",
379
+ "This is e_latent_loss tensor(1.6050, device='cuda:0')\n",
380
+ "57\n",
381
+ "[0]\n",
382
+ "Processed 30/21213 images\n",
383
+ "this is q_latent_loss tensor(9.1634, device='cuda:0')\n",
384
+ "This is e_latent_loss tensor(2.2909, device='cuda:0')\n",
385
+ "57\n",
386
+ "[0]\n"
387
+ ]
388
+ },
389
+ {
390
+ "name": "stderr",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
394
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
395
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
396
+ ]
397
+ },
398
+ {
399
+ "name": "stdout",
400
+ "output_type": "stream",
401
+ "text": [
402
+ "this is q_latent_loss tensor(11.9688, device='cuda:0')\n",
403
+ "This is e_latent_loss tensor(2.9922, device='cuda:0')\n",
404
+ "57\n",
405
+ "[0]\n",
406
+ "this is q_latent_loss tensor(9.6793, device='cuda:0')\n",
407
+ "This is e_latent_loss tensor(2.4198, device='cuda:0')\n",
408
+ "57\n",
409
+ "[0]\n",
410
+ "this is q_latent_loss tensor(10.4200, device='cuda:0')\n",
411
+ "This is e_latent_loss tensor(2.6050, device='cuda:0')\n",
412
+ "57\n",
413
+ "[0]\n"
414
+ ]
415
+ },
416
+ {
417
+ "name": "stderr",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
421
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
422
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
423
+ ]
424
+ },
425
+ {
426
+ "name": "stdout",
427
+ "output_type": "stream",
428
+ "text": [
429
+ "this is q_latent_loss tensor(10.4276, device='cuda:0')\n",
430
+ "This is e_latent_loss tensor(2.6069, device='cuda:0')\n",
431
+ "57\n",
432
+ "[0]\n",
433
+ "this is q_latent_loss tensor(10.8010, device='cuda:0')\n",
434
+ "This is e_latent_loss tensor(2.7003, device='cuda:0')\n",
435
+ "57\n",
436
+ "[0]\n",
437
+ "this is q_latent_loss tensor(7.6418, device='cuda:0')\n",
438
+ "This is e_latent_loss tensor(1.9105, device='cuda:0')\n",
439
+ "57\n",
440
+ "[0]\n"
441
+ ]
442
+ },
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
448
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
449
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
450
+ ]
451
+ },
452
+ {
453
+ "name": "stdout",
454
+ "output_type": "stream",
455
+ "text": [
456
+ "this is q_latent_loss tensor(8.3313, device='cuda:0')\n",
457
+ "This is e_latent_loss tensor(2.0828, device='cuda:0')\n",
458
+ "57\n",
459
+ "[0]\n",
460
+ "this is q_latent_loss tensor(9.5423, device='cuda:0')\n",
461
+ "This is e_latent_loss tensor(2.3856, device='cuda:0')\n",
462
+ "57\n",
463
+ "[0]\n",
464
+ "this is q_latent_loss tensor(8.8121, device='cuda:0')\n",
465
+ "This is e_latent_loss tensor(2.2030, device='cuda:0')\n",
466
+ "57\n",
467
+ "[0]\n",
468
+ "Processed 40/21213 images\n"
469
+ ]
470
+ },
471
+ {
472
+ "name": "stderr",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
476
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
477
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
478
+ ]
479
+ },
480
+ {
481
+ "name": "stdout",
482
+ "output_type": "stream",
483
+ "text": [
484
+ "this is q_latent_loss tensor(12.9253, device='cuda:0')\n",
485
+ "This is e_latent_loss tensor(3.2313, device='cuda:0')\n",
486
+ "57\n",
487
+ "[0]\n",
488
+ "this is q_latent_loss tensor(8.9912, device='cuda:0')\n",
489
+ "This is e_latent_loss tensor(2.2478, device='cuda:0')\n",
490
+ "57\n",
491
+ "[0]\n",
492
+ "this is q_latent_loss tensor(6.2949, device='cuda:0')\n",
493
+ "This is e_latent_loss tensor(1.5737, device='cuda:0')\n",
494
+ "57\n",
495
+ "[0]\n"
496
+ ]
497
+ },
498
+ {
499
+ "name": "stderr",
500
+ "output_type": "stream",
501
+ "text": [
502
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
503
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
504
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
505
+ ]
506
+ },
507
+ {
508
+ "name": "stdout",
509
+ "output_type": "stream",
510
+ "text": [
511
+ "this is q_latent_loss tensor(14.0637, device='cuda:0')\n",
512
+ "This is e_latent_loss tensor(3.5159, device='cuda:0')\n",
513
+ "57\n",
514
+ "[0]\n",
515
+ "this is q_latent_loss tensor(11.6149, device='cuda:0')\n",
516
+ "This is e_latent_loss tensor(2.9037, device='cuda:0')\n",
517
+ "57\n",
518
+ "[0]\n",
519
+ "this is q_latent_loss tensor(12.6029, device='cuda:0')\n",
520
+ "This is e_latent_loss tensor(3.1507, device='cuda:0')\n",
521
+ "57\n",
522
+ "[0]\n"
523
+ ]
524
+ },
525
+ {
526
+ "name": "stderr",
527
+ "output_type": "stream",
528
+ "text": [
529
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
530
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
531
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
532
+ ]
533
+ },
534
+ {
535
+ "name": "stdout",
536
+ "output_type": "stream",
537
+ "text": [
538
+ "this is q_latent_loss tensor(13.2988, device='cuda:0')\n",
539
+ "This is e_latent_loss tensor(3.3247, device='cuda:0')\n",
540
+ "57\n",
541
+ "[0]\n",
542
+ "this is q_latent_loss tensor(13.8039, device='cuda:0')\n",
543
+ "This is e_latent_loss tensor(3.4510, device='cuda:0')\n",
544
+ "57\n",
545
+ "[0]\n",
546
+ "this is q_latent_loss tensor(8.4613, device='cuda:0')\n",
547
+ "This is e_latent_loss tensor(2.1153, device='cuda:0')\n",
548
+ "57\n",
549
+ "[0]\n"
550
+ ]
551
+ },
552
+ {
553
+ "name": "stderr",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
557
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
558
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
559
+ ]
560
+ },
561
+ {
562
+ "name": "stdout",
563
+ "output_type": "stream",
564
+ "text": [
565
+ "this is q_latent_loss tensor(12.6695, device='cuda:0')\n",
566
+ "This is e_latent_loss tensor(3.1674, device='cuda:0')\n",
567
+ "57\n",
568
+ "[0]\n",
569
+ "Processed 50/21213 images\n",
570
+ "this is q_latent_loss tensor(9.4005, device='cuda:0')\n",
571
+ "This is e_latent_loss tensor(2.3501, device='cuda:0')\n",
572
+ "57\n",
573
+ "[0]\n",
574
+ "this is q_latent_loss tensor(12.8682, device='cuda:0')\n",
575
+ "This is e_latent_loss tensor(3.2171, device='cuda:0')\n",
576
+ "57\n",
577
+ "[0]\n"
578
+ ]
579
+ },
580
+ {
581
+ "name": "stderr",
582
+ "output_type": "stream",
583
+ "text": [
584
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
585
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
586
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
587
+ ]
588
+ },
589
+ {
590
+ "name": "stdout",
591
+ "output_type": "stream",
592
+ "text": [
593
+ "this is q_latent_loss tensor(8.6948, device='cuda:0')\n",
594
+ "This is e_latent_loss tensor(2.1737, device='cuda:0')\n",
595
+ "57\n",
596
+ "[0]\n",
597
+ "this is q_latent_loss tensor(8.5272, device='cuda:0')\n",
598
+ "This is e_latent_loss tensor(2.1318, device='cuda:0')\n",
599
+ "57\n",
600
+ "[0]\n",
601
+ "this is q_latent_loss tensor(8.7591, device='cuda:0')\n",
602
+ "This is e_latent_loss tensor(2.1898, device='cuda:0')\n",
603
+ "57\n",
604
+ "[0]\n"
605
+ ]
606
+ },
607
+ {
608
+ "name": "stderr",
609
+ "output_type": "stream",
610
+ "text": [
611
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
612
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
613
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
614
+ ]
615
+ },
616
+ {
617
+ "name": "stdout",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "this is q_latent_loss tensor(10.1201, device='cuda:0')\n",
621
+ "This is e_latent_loss tensor(2.5300, device='cuda:0')\n",
622
+ "57\n",
623
+ "[0]\n",
624
+ "this is q_latent_loss tensor(10.5750, device='cuda:0')\n",
625
+ "This is e_latent_loss tensor(2.6437, device='cuda:0')\n",
626
+ "57\n",
627
+ "[0]\n",
628
+ "this is q_latent_loss tensor(7.1127, device='cuda:0')\n",
629
+ "This is e_latent_loss tensor(1.7782, device='cuda:0')\n",
630
+ "57\n",
631
+ "[0]\n"
632
+ ]
633
+ },
634
+ {
635
+ "name": "stderr",
636
+ "output_type": "stream",
637
+ "text": [
638
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
639
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
640
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
641
+ ]
642
+ },
643
+ {
644
+ "name": "stdout",
645
+ "output_type": "stream",
646
+ "text": [
647
+ "this is q_latent_loss tensor(9.6827, device='cuda:0')\n",
648
+ "This is e_latent_loss tensor(2.4207, device='cuda:0')\n",
649
+ "57\n",
650
+ "[0]\n",
651
+ "this is q_latent_loss tensor(11.4962, device='cuda:0')\n",
652
+ "This is e_latent_loss tensor(2.8740, device='cuda:0')\n",
653
+ "57\n",
654
+ "[0]\n",
655
+ "Processed 60/21213 images\n",
656
+ "this is q_latent_loss tensor(9.2464, device='cuda:0')\n",
657
+ "This is e_latent_loss tensor(2.3116, device='cuda:0')\n",
658
+ "57\n",
659
+ "[0]\n"
660
+ ]
661
+ },
662
+ {
663
+ "name": "stderr",
664
+ "output_type": "stream",
665
+ "text": [
666
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
667
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
668
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
669
+ ]
670
+ },
671
+ {
672
+ "name": "stdout",
673
+ "output_type": "stream",
674
+ "text": [
675
+ "this is q_latent_loss tensor(9.1726, device='cuda:0')\n",
676
+ "This is e_latent_loss tensor(2.2932, device='cuda:0')\n",
677
+ "57\n",
678
+ "[0]\n",
679
+ "this is q_latent_loss tensor(7.0186, device='cuda:0')\n",
680
+ "This is e_latent_loss tensor(1.7546, device='cuda:0')\n",
681
+ "57\n",
682
+ "[0]\n"
683
+ ]
684
+ },
685
+ {
686
+ "name": "stderr",
687
+ "output_type": "stream",
688
+ "text": [
689
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
690
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
691
+ ]
692
+ },
693
+ {
694
+ "name": "stdout",
695
+ "output_type": "stream",
696
+ "text": [
697
+ "this is q_latent_loss tensor(9.8717, device='cuda:0')\n",
698
+ "This is e_latent_loss tensor(2.4679, device='cuda:0')\n",
699
+ "57\n",
700
+ "[0]\n",
701
+ "this is q_latent_loss tensor(14.1204, device='cuda:0')\n",
702
+ "This is e_latent_loss tensor(3.5301, device='cuda:0')\n",
703
+ "57\n",
704
+ "[0]\n"
705
+ ]
706
+ },
707
+ {
708
+ "name": "stderr",
709
+ "output_type": "stream",
710
+ "text": [
711
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
712
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
713
+ ]
714
+ },
715
+ {
716
+ "name": "stdout",
717
+ "output_type": "stream",
718
+ "text": [
719
+ "this is q_latent_loss tensor(8.3599, device='cuda:0')\n",
720
+ "This is e_latent_loss tensor(2.0900, device='cuda:0')\n",
721
+ "57\n",
722
+ "[0]\n",
723
+ "this is q_latent_loss tensor(12.3845, device='cuda:0')\n",
724
+ "This is e_latent_loss tensor(3.0961, device='cuda:0')\n",
725
+ "57\n",
726
+ "[0]\n"
727
+ ]
728
+ },
729
+ {
730
+ "name": "stderr",
731
+ "output_type": "stream",
732
+ "text": [
733
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
734
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
735
+ ]
736
+ },
737
+ {
738
+ "name": "stdout",
739
+ "output_type": "stream",
740
+ "text": [
741
+ "this is q_latent_loss tensor(11.8901, device='cuda:0')\n",
742
+ "This is e_latent_loss tensor(2.9725, device='cuda:0')\n",
743
+ "57\n",
744
+ "[0]\n",
745
+ "this is q_latent_loss tensor(12.0965, device='cuda:0')\n",
746
+ "This is e_latent_loss tensor(3.0241, device='cuda:0')\n",
747
+ "57\n",
748
+ "[0]\n"
749
+ ]
750
+ },
751
+ {
752
+ "name": "stderr",
753
+ "output_type": "stream",
754
+ "text": [
755
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
756
+ ]
757
+ },
758
+ {
759
+ "name": "stdout",
760
+ "output_type": "stream",
761
+ "text": [
762
+ "this is q_latent_loss tensor(8.8583, device='cuda:0')\n",
763
+ "This is e_latent_loss tensor(2.2146, device='cuda:0')\n",
764
+ "57\n",
765
+ "[0]\n",
766
+ "Processed 70/21213 images\n"
767
+ ]
768
+ },
769
+ {
770
+ "name": "stderr",
771
+ "output_type": "stream",
772
+ "text": [
773
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
774
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
775
+ ]
776
+ },
777
+ {
778
+ "name": "stdout",
779
+ "output_type": "stream",
780
+ "text": [
781
+ "this is q_latent_loss tensor(11.9591, device='cuda:0')\n",
782
+ "This is e_latent_loss tensor(2.9898, device='cuda:0')\n",
783
+ "57\n",
784
+ "[0]\n",
785
+ "this is q_latent_loss tensor(8.9921, device='cuda:0')\n",
786
+ "This is e_latent_loss tensor(2.2480, device='cuda:0')\n",
787
+ "57\n",
788
+ "[0]\n"
789
+ ]
790
+ },
791
+ {
792
+ "name": "stderr",
793
+ "output_type": "stream",
794
+ "text": [
795
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
796
+ ]
797
+ },
798
+ {
799
+ "name": "stdout",
800
+ "output_type": "stream",
801
+ "text": [
802
+ "this is q_latent_loss tensor(7.3415, device='cuda:0')\n",
803
+ "This is e_latent_loss tensor(1.8354, device='cuda:0')\n",
804
+ "57\n",
805
+ "[0]\n"
806
+ ]
807
+ },
808
+ {
809
+ "name": "stderr",
810
+ "output_type": "stream",
811
+ "text": [
812
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
813
+ ]
814
+ },
815
+ {
816
+ "name": "stdout",
817
+ "output_type": "stream",
818
+ "text": [
819
+ "this is q_latent_loss tensor(9.7903, device='cuda:0')\n",
820
+ "This is e_latent_loss tensor(2.4476, device='cuda:0')\n",
821
+ "57\n",
822
+ "[0]\n"
823
+ ]
824
+ },
825
+ {
826
+ "name": "stderr",
827
+ "output_type": "stream",
828
+ "text": [
829
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
830
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
831
+ ]
832
+ },
833
+ {
834
+ "name": "stdout",
835
+ "output_type": "stream",
836
+ "text": [
837
+ "this is q_latent_loss tensor(8.8003, device='cuda:0')\n",
838
+ "This is e_latent_loss tensor(2.2001, device='cuda:0')\n",
839
+ "57\n",
840
+ "[0]\n",
841
+ "this is q_latent_loss tensor(6.3422, device='cuda:0')\n",
842
+ "This is e_latent_loss tensor(1.5856, device='cuda:0')\n",
843
+ "57\n",
844
+ "[0]\n"
845
+ ]
846
+ },
847
+ {
848
+ "name": "stderr",
849
+ "output_type": "stream",
850
+ "text": [
851
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
852
+ ]
853
+ },
854
+ {
855
+ "name": "stdout",
856
+ "output_type": "stream",
857
+ "text": [
858
+ "this is q_latent_loss tensor(8.4356, device='cuda:0')\n",
859
+ "This is e_latent_loss tensor(2.1089, device='cuda:0')\n",
860
+ "57\n",
861
+ "[0]\n",
862
+ "this is q_latent_loss tensor(9.7969, device='cuda:0')\n",
863
+ "This is e_latent_loss tensor(2.4492, device='cuda:0')\n",
864
+ "57\n",
865
+ "[0]\n"
866
+ ]
867
+ },
868
+ {
869
+ "name": "stderr",
870
+ "output_type": "stream",
871
+ "text": [
872
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
873
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
874
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
875
+ ]
876
+ },
877
+ {
878
+ "name": "stdout",
879
+ "output_type": "stream",
880
+ "text": [
881
+ "this is q_latent_loss tensor(10.9973, device='cuda:0')\n",
882
+ "This is e_latent_loss tensor(2.7493, device='cuda:0')\n",
883
+ "57\n",
884
+ "[0]\n",
885
+ "this is q_latent_loss tensor(8.3360, device='cuda:0')\n",
886
+ "This is e_latent_loss tensor(2.0840, device='cuda:0')\n",
887
+ "57\n",
888
+ "[0]\n",
889
+ "Processed 80/21213 images\n"
890
+ ]
891
+ },
892
+ {
893
+ "name": "stderr",
894
+ "output_type": "stream",
895
+ "text": [
896
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
897
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
898
+ ]
899
+ },
900
+ {
901
+ "name": "stdout",
902
+ "output_type": "stream",
903
+ "text": [
904
+ "this is q_latent_loss tensor(11.2751, device='cuda:0')\n",
905
+ "This is e_latent_loss tensor(2.8188, device='cuda:0')\n",
906
+ "57\n",
907
+ "[0]\n",
908
+ "this is q_latent_loss tensor(8.1294, device='cuda:0')\n",
909
+ "This is e_latent_loss tensor(2.0324, device='cuda:0')\n",
910
+ "57\n",
911
+ "[0]\n"
912
+ ]
913
+ },
914
+ {
915
+ "name": "stderr",
916
+ "output_type": "stream",
917
+ "text": [
918
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
919
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
920
+ ]
921
+ },
922
+ {
923
+ "name": "stdout",
924
+ "output_type": "stream",
925
+ "text": [
926
+ "this is q_latent_loss tensor(7.8635, device='cuda:0')\n",
927
+ "This is e_latent_loss tensor(1.9659, device='cuda:0')\n",
928
+ "57\n",
929
+ "[0]\n",
930
+ "this is q_latent_loss tensor(7.8551, device='cuda:0')\n",
931
+ "This is e_latent_loss tensor(1.9638, device='cuda:0')\n",
932
+ "57\n",
933
+ "[0]\n"
934
+ ]
935
+ },
936
+ {
937
+ "name": "stderr",
938
+ "output_type": "stream",
939
+ "text": [
940
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
941
+ ]
942
+ },
943
+ {
944
+ "name": "stdout",
945
+ "output_type": "stream",
946
+ "text": [
947
+ "this is q_latent_loss tensor(9.8822, device='cuda:0')\n",
948
+ "This is e_latent_loss tensor(2.4705, device='cuda:0')\n",
949
+ "57\n",
950
+ "[0]\n"
951
+ ]
952
+ },
953
+ {
954
+ "name": "stderr",
955
+ "output_type": "stream",
956
+ "text": [
957
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
958
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
959
+ ]
960
+ },
961
+ {
962
+ "name": "stdout",
963
+ "output_type": "stream",
964
+ "text": [
965
+ "this is q_latent_loss tensor(8.0291, device='cuda:0')\n",
966
+ "This is e_latent_loss tensor(2.0073, device='cuda:0')\n",
967
+ "57\n",
968
+ "[0]\n",
969
+ "this is q_latent_loss tensor(9.0501, device='cuda:0')\n",
970
+ "This is e_latent_loss tensor(2.2625, device='cuda:0')\n",
971
+ "57\n",
972
+ "[0]\n"
973
+ ]
974
+ },
975
+ {
976
+ "name": "stderr",
977
+ "output_type": "stream",
978
+ "text": [
979
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
980
+ ]
981
+ },
982
+ {
983
+ "name": "stdout",
984
+ "output_type": "stream",
985
+ "text": [
986
+ "this is q_latent_loss tensor(12.6947, device='cuda:0')\n",
987
+ "This is e_latent_loss tensor(3.1737, device='cuda:0')\n",
988
+ "57\n",
989
+ "[0]\n"
990
+ ]
991
+ },
992
+ {
993
+ "name": "stderr",
994
+ "output_type": "stream",
995
+ "text": [
996
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
997
+ ]
998
+ },
999
+ {
1000
+ "name": "stdout",
1001
+ "output_type": "stream",
1002
+ "text": [
1003
+ "this is q_latent_loss tensor(9.9940, device='cuda:0')\n",
1004
+ "This is e_latent_loss tensor(2.4985, device='cuda:0')\n",
1005
+ "57\n",
1006
+ "[0]\n"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "name": "stderr",
1011
+ "output_type": "stream",
1012
+ "text": [
1013
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1014
+ ]
1015
+ },
1016
+ {
1017
+ "name": "stdout",
1018
+ "output_type": "stream",
1019
+ "text": [
1020
+ "this is q_latent_loss tensor(7.2904, device='cuda:0')\n",
1021
+ "This is e_latent_loss tensor(1.8226, device='cuda:0')\n",
1022
+ "57\n",
1023
+ "[0]\n",
1024
+ "Processed 90/21213 images\n",
1025
+ "this is q_latent_loss tensor(9.6648, device='cuda:0')\n",
1026
+ "This is e_latent_loss tensor(2.4162, device='cuda:0')\n",
1027
+ "57\n",
1028
+ "[0]\n"
1029
+ ]
1030
+ },
1031
+ {
1032
+ "name": "stderr",
1033
+ "output_type": "stream",
1034
+ "text": [
1035
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1036
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1037
+ ]
1038
+ },
1039
+ {
1040
+ "name": "stdout",
1041
+ "output_type": "stream",
1042
+ "text": [
1043
+ "this is q_latent_loss tensor(9.5542, device='cuda:0')\n",
1044
+ "This is e_latent_loss tensor(2.3886, device='cuda:0')\n",
1045
+ "57\n",
1046
+ "[0]\n"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "name": "stderr",
1051
+ "output_type": "stream",
1052
+ "text": [
1053
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1054
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1055
+ ]
1056
+ },
1057
+ {
1058
+ "name": "stdout",
1059
+ "output_type": "stream",
1060
+ "text": [
1061
+ "this is q_latent_loss tensor(9.8552, device='cuda:0')\n",
1062
+ "This is e_latent_loss tensor(2.4638, device='cuda:0')\n",
1063
+ "57\n",
1064
+ "[0]\n",
1065
+ "this is q_latent_loss tensor(8.3666, device='cuda:0')\n",
1066
+ "This is e_latent_loss tensor(2.0916, device='cuda:0')\n",
1067
+ "57\n",
1068
+ "[0]\n"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "name": "stderr",
1073
+ "output_type": "stream",
1074
+ "text": [
1075
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "name": "stdout",
1080
+ "output_type": "stream",
1081
+ "text": [
1082
+ "this is q_latent_loss tensor(9.7352, device='cuda:0')\n",
1083
+ "This is e_latent_loss tensor(2.4338, device='cuda:0')\n",
1084
+ "57\n",
1085
+ "[0]\n"
1086
+ ]
1087
+ },
1088
+ {
1089
+ "name": "stderr",
1090
+ "output_type": "stream",
1091
+ "text": [
1092
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1093
+ ]
1094
+ },
1095
+ {
1096
+ "name": "stdout",
1097
+ "output_type": "stream",
1098
+ "text": [
1099
+ "this is q_latent_loss tensor(8.1629, device='cuda:0')\n",
1100
+ "This is e_latent_loss tensor(2.0407, device='cuda:0')\n",
1101
+ "57\n",
1102
+ "[0]\n",
1103
+ "this is q_latent_loss tensor(14.1190, device='cuda:0')\n",
1104
+ "This is e_latent_loss tensor(3.5297, device='cuda:0')\n",
1105
+ "57\n",
1106
+ "[0]\n"
1107
+ ]
1108
+ },
1109
+ {
1110
+ "name": "stderr",
1111
+ "output_type": "stream",
1112
+ "text": [
1113
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1114
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1115
+ ]
1116
+ },
1117
+ {
1118
+ "name": "stdout",
1119
+ "output_type": "stream",
1120
+ "text": [
1121
+ "this is q_latent_loss tensor(11.3341, device='cuda:0')\n",
1122
+ "This is e_latent_loss tensor(2.8335, device='cuda:0')\n",
1123
+ "57\n",
1124
+ "[0]\n"
1125
+ ]
1126
+ },
1127
+ {
1128
+ "name": "stderr",
1129
+ "output_type": "stream",
1130
+ "text": [
1131
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1132
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1133
+ ]
1134
+ },
1135
+ {
1136
+ "name": "stdout",
1137
+ "output_type": "stream",
1138
+ "text": [
1139
+ "this is q_latent_loss tensor(8.9301, device='cuda:0')\n",
1140
+ "This is e_latent_loss tensor(2.2325, device='cuda:0')\n",
1141
+ "57\n",
1142
+ "[0]\n",
1143
+ "this is q_latent_loss tensor(6.2145, device='cuda:0')\n",
1144
+ "This is e_latent_loss tensor(1.5536, device='cuda:0')\n",
1145
+ "57\n",
1146
+ "[0]\n",
1147
+ "Processed 100/21213 images\n"
1148
+ ]
1149
+ },
1150
+ {
1151
+ "name": "stderr",
1152
+ "output_type": "stream",
1153
+ "text": [
1154
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1155
+ ]
1156
+ },
1157
+ {
1158
+ "name": "stdout",
1159
+ "output_type": "stream",
1160
+ "text": [
1161
+ "this is q_latent_loss tensor(8.1991, device='cuda:0')\n",
1162
+ "This is e_latent_loss tensor(2.0498, device='cuda:0')\n",
1163
+ "57\n",
1164
+ "[0]\n"
1165
+ ]
1166
+ },
1167
+ {
1168
+ "name": "stderr",
1169
+ "output_type": "stream",
1170
+ "text": [
1171
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1172
+ ]
1173
+ },
1174
+ {
1175
+ "name": "stdout",
1176
+ "output_type": "stream",
1177
+ "text": [
1178
+ "this is q_latent_loss tensor(11.6093, device='cuda:0')\n",
1179
+ "This is e_latent_loss tensor(2.9023, device='cuda:0')\n",
1180
+ "57\n",
1181
+ "[0]\n"
1182
+ ]
1183
+ },
1184
+ {
1185
+ "name": "stderr",
1186
+ "output_type": "stream",
1187
+ "text": [
1188
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1189
+ ]
1190
+ },
1191
+ {
1192
+ "name": "stdout",
1193
+ "output_type": "stream",
1194
+ "text": [
1195
+ "this is q_latent_loss tensor(9.7087, device='cuda:0')\n",
1196
+ "This is e_latent_loss tensor(2.4272, device='cuda:0')\n",
1197
+ "57\n",
1198
+ "[0]\n"
1199
+ ]
1200
+ },
1201
+ {
1202
+ "name": "stderr",
1203
+ "output_type": "stream",
1204
+ "text": [
1205
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1206
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1207
+ ]
1208
+ },
1209
+ {
1210
+ "name": "stdout",
1211
+ "output_type": "stream",
1212
+ "text": [
1213
+ "this is q_latent_loss tensor(8.8062, device='cuda:0')\n",
1214
+ "This is e_latent_loss tensor(2.2016, device='cuda:0')\n",
1215
+ "57\n",
1216
+ "[0]\n",
1217
+ "this is q_latent_loss tensor(8.4319, device='cuda:0')\n",
1218
+ "This is e_latent_loss tensor(2.1080, device='cuda:0')\n",
1219
+ "57\n",
1220
+ "[0]\n"
1221
+ ]
1222
+ },
1223
+ {
1224
+ "name": "stderr",
1225
+ "output_type": "stream",
1226
+ "text": [
1227
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1228
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1229
+ ]
1230
+ },
1231
+ {
1232
+ "name": "stdout",
1233
+ "output_type": "stream",
1234
+ "text": [
1235
+ "this is q_latent_loss tensor(8.5253, device='cuda:0')\n",
1236
+ "This is e_latent_loss tensor(2.1313, device='cuda:0')\n",
1237
+ "57\n",
1238
+ "[0]\n",
1239
+ "this is q_latent_loss tensor(8.1614, device='cuda:0')\n",
1240
+ "This is e_latent_loss tensor(2.0404, device='cuda:0')\n",
1241
+ "57\n",
1242
+ "[0]\n"
1243
+ ]
1244
+ },
1245
+ {
1246
+ "name": "stderr",
1247
+ "output_type": "stream",
1248
+ "text": [
1249
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1250
+ ]
1251
+ },
1252
+ {
1253
+ "name": "stdout",
1254
+ "output_type": "stream",
1255
+ "text": [
1256
+ "this is q_latent_loss tensor(9.9642, device='cuda:0')\n",
1257
+ "This is e_latent_loss tensor(2.4910, device='cuda:0')\n",
1258
+ "57\n",
1259
+ "[0]\n"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "name": "stderr",
1264
+ "output_type": "stream",
1265
+ "text": [
1266
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1267
+ ]
1268
+ },
1269
+ {
1270
+ "name": "stdout",
1271
+ "output_type": "stream",
1272
+ "text": [
1273
+ "this is q_latent_loss tensor(9.2686, device='cuda:0')\n",
1274
+ "This is e_latent_loss tensor(2.3171, device='cuda:0')\n",
1275
+ "57\n",
1276
+ "[0]\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "name": "stderr",
1281
+ "output_type": "stream",
1282
+ "text": [
1283
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1284
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1285
+ ]
1286
+ },
1287
+ {
1288
+ "name": "stdout",
1289
+ "output_type": "stream",
1290
+ "text": [
1291
+ "this is q_latent_loss tensor(10.1140, device='cuda:0')\n",
1292
+ "This is e_latent_loss tensor(2.5285, device='cuda:0')\n",
1293
+ "57\n",
1294
+ "[0]\n",
1295
+ "Processed 110/21213 images\n",
1296
+ "this is q_latent_loss tensor(12.3632, device='cuda:0')\n",
1297
+ "This is e_latent_loss tensor(3.0908, device='cuda:0')\n",
1298
+ "57\n",
1299
+ "[0]\n",
1300
+ "this is q_latent_loss tensor(8.1329, device='cuda:0')\n",
1301
+ "This is e_latent_loss tensor(2.0332, device='cuda:0')\n",
1302
+ "57\n",
1303
+ "[0]\n"
1304
+ ]
1305
+ },
1306
+ {
1307
+ "name": "stderr",
1308
+ "output_type": "stream",
1309
+ "text": [
1310
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1311
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1312
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1313
+ ]
1314
+ },
1315
+ {
1316
+ "name": "stdout",
1317
+ "output_type": "stream",
1318
+ "text": [
1319
+ "this is q_latent_loss tensor(8.9918, device='cuda:0')\n",
1320
+ "This is e_latent_loss tensor(2.2479, device='cuda:0')\n",
1321
+ "57\n",
1322
+ "[0]\n",
1323
+ "this is q_latent_loss tensor(7.1709, device='cuda:0')\n",
1324
+ "This is e_latent_loss tensor(1.7927, device='cuda:0')\n",
1325
+ "57\n",
1326
+ "[0]\n",
1327
+ "this is q_latent_loss tensor(10.1411, device='cuda:0')\n",
1328
+ "This is e_latent_loss tensor(2.5353, device='cuda:0')\n",
1329
+ "57\n",
1330
+ "[0]\n"
1331
+ ]
1332
+ },
1333
+ {
1334
+ "name": "stderr",
1335
+ "output_type": "stream",
1336
+ "text": [
1337
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1338
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1339
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1340
+ ]
1341
+ },
1342
+ {
1343
+ "name": "stdout",
1344
+ "output_type": "stream",
1345
+ "text": [
1346
+ "this is q_latent_loss tensor(10.4103, device='cuda:0')\n",
1347
+ "This is e_latent_loss tensor(2.6026, device='cuda:0')\n",
1348
+ "57\n",
1349
+ "[0]\n",
1350
+ "this is q_latent_loss tensor(9.9719, device='cuda:0')\n",
1351
+ "This is e_latent_loss tensor(2.4930, device='cuda:0')\n",
1352
+ "57\n",
1353
+ "[0]\n",
1354
+ "this is q_latent_loss tensor(6.5009, device='cuda:0')\n",
1355
+ "This is e_latent_loss tensor(1.6252, device='cuda:0')\n",
1356
+ "57\n",
1357
+ "[0]\n"
1358
+ ]
1359
+ },
1360
+ {
1361
+ "name": "stderr",
1362
+ "output_type": "stream",
1363
+ "text": [
1364
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1365
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1366
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1367
+ ]
1368
+ },
1369
+ {
1370
+ "name": "stdout",
1371
+ "output_type": "stream",
1372
+ "text": [
1373
+ "this is q_latent_loss tensor(7.8178, device='cuda:0')\n",
1374
+ "This is e_latent_loss tensor(1.9544, device='cuda:0')\n",
1375
+ "57\n",
1376
+ "[0]\n",
1377
+ "this is q_latent_loss tensor(12.2491, device='cuda:0')\n",
1378
+ "This is e_latent_loss tensor(3.0623, device='cuda:0')\n",
1379
+ "57\n",
1380
+ "[0]\n",
1381
+ "Processed 120/21213 images\n",
1382
+ "this is q_latent_loss tensor(11.7427, device='cuda:0')\n",
1383
+ "This is e_latent_loss tensor(2.9357, device='cuda:0')\n",
1384
+ "57\n",
1385
+ "[0]\n"
1386
+ ]
1387
+ },
1388
+ {
1389
+ "name": "stderr",
1390
+ "output_type": "stream",
1391
+ "text": [
1392
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1393
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1394
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1395
+ ]
1396
+ },
1397
+ {
1398
+ "name": "stdout",
1399
+ "output_type": "stream",
1400
+ "text": [
1401
+ "this is q_latent_loss tensor(8.0834, device='cuda:0')\n",
1402
+ "This is e_latent_loss tensor(2.0208, device='cuda:0')\n",
1403
+ "57\n",
1404
+ "[0]\n",
1405
+ "this is q_latent_loss tensor(12.1786, device='cuda:0')\n",
1406
+ "This is e_latent_loss tensor(3.0447, device='cuda:0')\n",
1407
+ "57\n",
1408
+ "[0]\n",
1409
+ "this is q_latent_loss tensor(9.7889, device='cuda:0')\n",
1410
+ "This is e_latent_loss tensor(2.4472, device='cuda:0')\n",
1411
+ "57\n",
1412
+ "[0]\n"
1413
+ ]
1414
+ },
1415
+ {
1416
+ "name": "stderr",
1417
+ "output_type": "stream",
1418
+ "text": [
1419
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1420
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1421
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "name": "stdout",
1426
+ "output_type": "stream",
1427
+ "text": [
1428
+ "this is q_latent_loss tensor(8.3587, device='cuda:0')\n",
1429
+ "This is e_latent_loss tensor(2.0897, device='cuda:0')\n",
1430
+ "57\n",
1431
+ "[0]\n",
1432
+ "this is q_latent_loss tensor(8.1803, device='cuda:0')\n",
1433
+ "This is e_latent_loss tensor(2.0451, device='cuda:0')\n",
1434
+ "57\n",
1435
+ "[0]\n",
1436
+ "this is q_latent_loss tensor(9.9105, device='cuda:0')\n",
1437
+ "This is e_latent_loss tensor(2.4776, device='cuda:0')\n",
1438
+ "57\n",
1439
+ "[0]\n"
1440
+ ]
1441
+ },
1442
+ {
1443
+ "name": "stderr",
1444
+ "output_type": "stream",
1445
+ "text": [
1446
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1447
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1448
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1449
+ ]
1450
+ },
1451
+ {
1452
+ "name": "stdout",
1453
+ "output_type": "stream",
1454
+ "text": [
1455
+ "this is q_latent_loss tensor(8.8691, device='cuda:0')\n",
1456
+ "This is e_latent_loss tensor(2.2173, device='cuda:0')\n",
1457
+ "57\n",
1458
+ "[0]\n",
1459
+ "this is q_latent_loss tensor(7.4474, device='cuda:0')\n",
1460
+ "This is e_latent_loss tensor(1.8619, device='cuda:0')\n",
1461
+ "57\n",
1462
+ "[0]\n",
1463
+ "this is q_latent_loss tensor(9.4359, device='cuda:0')\n",
1464
+ "This is e_latent_loss tensor(2.3590, device='cuda:0')\n",
1465
+ "57\n",
1466
+ "[0]\n",
1467
+ "Processed 130/21213 images\n"
1468
+ ]
1469
+ },
1470
+ {
1471
+ "name": "stderr",
1472
+ "output_type": "stream",
1473
+ "text": [
1474
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1475
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1476
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1477
+ ]
1478
+ },
1479
+ {
1480
+ "name": "stdout",
1481
+ "output_type": "stream",
1482
+ "text": [
1483
+ "this is q_latent_loss tensor(6.5443, device='cuda:0')\n",
1484
+ "This is e_latent_loss tensor(1.6361, device='cuda:0')\n",
1485
+ "57\n",
1486
+ "[0]\n",
1487
+ "this is q_latent_loss tensor(7.5185, device='cuda:0')\n",
1488
+ "This is e_latent_loss tensor(1.8796, device='cuda:0')\n",
1489
+ "57\n",
1490
+ "[0]\n",
1491
+ "this is q_latent_loss tensor(15.6529, device='cuda:0')\n",
1492
+ "This is e_latent_loss tensor(3.9132, device='cuda:0')\n",
1493
+ "57\n",
1494
+ "[0]\n"
1495
+ ]
1496
+ },
1497
+ {
1498
+ "name": "stderr",
1499
+ "output_type": "stream",
1500
+ "text": [
1501
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1502
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1503
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1504
+ ]
1505
+ },
1506
+ {
1507
+ "name": "stdout",
1508
+ "output_type": "stream",
1509
+ "text": [
1510
+ "this is q_latent_loss tensor(11.5756, device='cuda:0')\n",
1511
+ "This is e_latent_loss tensor(2.8939, device='cuda:0')\n",
1512
+ "57\n",
1513
+ "[0]\n",
1514
+ "this is q_latent_loss tensor(12.6367, device='cuda:0')\n",
1515
+ "This is e_latent_loss tensor(3.1592, device='cuda:0')\n",
1516
+ "57\n",
1517
+ "[0]\n",
1518
+ "this is q_latent_loss tensor(10.2915, device='cuda:0')\n",
1519
+ "This is e_latent_loss tensor(2.5729, device='cuda:0')\n",
1520
+ "57\n",
1521
+ "[0]\n"
1522
+ ]
1523
+ },
1524
+ {
1525
+ "name": "stderr",
1526
+ "output_type": "stream",
1527
+ "text": [
1528
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1529
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1530
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1531
+ ]
1532
+ },
1533
+ {
1534
+ "name": "stdout",
1535
+ "output_type": "stream",
1536
+ "text": [
1537
+ "this is q_latent_loss tensor(9.9234, device='cuda:0')\n",
1538
+ "This is e_latent_loss tensor(2.4809, device='cuda:0')\n",
1539
+ "57\n",
1540
+ "[0]\n",
1541
+ "this is q_latent_loss tensor(12.2591, device='cuda:0')\n",
1542
+ "This is e_latent_loss tensor(3.0648, device='cuda:0')\n",
1543
+ "57\n",
1544
+ "[0]\n",
1545
+ "this is q_latent_loss tensor(9.1375, device='cuda:0')\n",
1546
+ "This is e_latent_loss tensor(2.2844, device='cuda:0')\n",
1547
+ "57\n",
1548
+ "[0]\n"
1549
+ ]
1550
+ },
1551
+ {
1552
+ "name": "stderr",
1553
+ "output_type": "stream",
1554
+ "text": [
1555
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1556
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1557
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1558
+ ]
1559
+ },
1560
+ {
1561
+ "name": "stdout",
1562
+ "output_type": "stream",
1563
+ "text": [
1564
+ "this is q_latent_loss tensor(8.9599, device='cuda:0')\n",
1565
+ "This is e_latent_loss tensor(2.2400, device='cuda:0')\n",
1566
+ "57\n",
1567
+ "[0]\n",
1568
+ "Processed 140/21213 images\n",
1569
+ "this is q_latent_loss tensor(10.5039, device='cuda:0')\n",
1570
+ "This is e_latent_loss tensor(2.6260, device='cuda:0')\n",
1571
+ "57\n",
1572
+ "[0]\n",
1573
+ "this is q_latent_loss tensor(9.1007, device='cuda:0')\n",
1574
+ "This is e_latent_loss tensor(2.2752, device='cuda:0')\n",
1575
+ "57\n",
1576
+ "[0]\n"
1577
+ ]
1578
+ },
1579
+ {
1580
+ "name": "stderr",
1581
+ "output_type": "stream",
1582
+ "text": [
1583
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1584
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1585
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1586
+ ]
1587
+ },
1588
+ {
1589
+ "name": "stdout",
1590
+ "output_type": "stream",
1591
+ "text": [
1592
+ "this is q_latent_loss tensor(13.8705, device='cuda:0')\n",
1593
+ "This is e_latent_loss tensor(3.4676, device='cuda:0')\n",
1594
+ "57\n",
1595
+ "[0]\n",
1596
+ "this is q_latent_loss tensor(10.4230, device='cuda:0')\n",
1597
+ "This is e_latent_loss tensor(2.6058, device='cuda:0')\n",
1598
+ "57\n",
1599
+ "[0]\n",
1600
+ "this is q_latent_loss tensor(10.0464, device='cuda:0')\n",
1601
+ "This is e_latent_loss tensor(2.5116, device='cuda:0')\n",
1602
+ "57\n",
1603
+ "[0]\n"
1604
+ ]
1605
+ },
1606
+ {
1607
+ "name": "stderr",
1608
+ "output_type": "stream",
1609
+ "text": [
1610
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1611
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1612
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1613
+ ]
1614
+ },
1615
+ {
1616
+ "name": "stdout",
1617
+ "output_type": "stream",
1618
+ "text": [
1619
+ "this is q_latent_loss tensor(10.8465, device='cuda:0')\n",
1620
+ "This is e_latent_loss tensor(2.7116, device='cuda:0')\n",
1621
+ "57\n",
1622
+ "[0]\n",
1623
+ "this is q_latent_loss tensor(9.8604, device='cuda:0')\n",
1624
+ "This is e_latent_loss tensor(2.4651, device='cuda:0')\n",
1625
+ "57\n",
1626
+ "[0]\n",
1627
+ "this is q_latent_loss tensor(7.0887, device='cuda:0')\n",
1628
+ "This is e_latent_loss tensor(1.7722, device='cuda:0')\n",
1629
+ "57\n",
1630
+ "[0]\n"
1631
+ ]
1632
+ },
1633
+ {
1634
+ "name": "stderr",
1635
+ "output_type": "stream",
1636
+ "text": [
1637
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1638
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1639
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1640
+ ]
1641
+ },
1642
+ {
1643
+ "name": "stdout",
1644
+ "output_type": "stream",
1645
+ "text": [
1646
+ "this is q_latent_loss tensor(9.0656, device='cuda:0')\n",
1647
+ "This is e_latent_loss tensor(2.2664, device='cuda:0')\n",
1648
+ "57\n",
1649
+ "[0]\n",
1650
+ "this is q_latent_loss tensor(8.3511, device='cuda:0')\n",
1651
+ "This is e_latent_loss tensor(2.0878, device='cuda:0')\n",
1652
+ "57\n",
1653
+ "[0]\n",
1654
+ "Processed 150/21213 images\n",
1655
+ "this is q_latent_loss tensor(11.1818, device='cuda:0')\n",
1656
+ "This is e_latent_loss tensor(2.7954, device='cuda:0')\n",
1657
+ "57\n",
1658
+ "[0]\n"
1659
+ ]
1660
+ },
1661
+ {
1662
+ "name": "stderr",
1663
+ "output_type": "stream",
1664
+ "text": [
1665
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1666
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1667
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1668
+ ]
1669
+ },
1670
+ {
1671
+ "name": "stdout",
1672
+ "output_type": "stream",
1673
+ "text": [
1674
+ "this is q_latent_loss tensor(7.8044, device='cuda:0')\n",
1675
+ "This is e_latent_loss tensor(1.9511, device='cuda:0')\n",
1676
+ "57\n",
1677
+ "[0]\n",
1678
+ "this is q_latent_loss tensor(8.0274, device='cuda:0')\n",
1679
+ "This is e_latent_loss tensor(2.0068, device='cuda:0')\n",
1680
+ "57\n",
1681
+ "[0]\n",
1682
+ "this is q_latent_loss tensor(8.4584, device='cuda:0')\n",
1683
+ "This is e_latent_loss tensor(2.1146, device='cuda:0')\n",
1684
+ "57\n",
1685
+ "[0]\n"
1686
+ ]
1687
+ },
1688
+ {
1689
+ "name": "stderr",
1690
+ "output_type": "stream",
1691
+ "text": [
1692
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1693
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1694
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1695
+ ]
1696
+ },
1697
+ {
1698
+ "name": "stdout",
1699
+ "output_type": "stream",
1700
+ "text": [
1701
+ "this is q_latent_loss tensor(8.1515, device='cuda:0')\n",
1702
+ "This is e_latent_loss tensor(2.0379, device='cuda:0')\n",
1703
+ "57\n",
1704
+ "[0]\n",
1705
+ "this is q_latent_loss tensor(9.8069, device='cuda:0')\n",
1706
+ "This is e_latent_loss tensor(2.4517, device='cuda:0')\n",
1707
+ "57\n",
1708
+ "[0]\n",
1709
+ "this is q_latent_loss tensor(10.3290, device='cuda:0')\n",
1710
+ "This is e_latent_loss tensor(2.5823, device='cuda:0')\n",
1711
+ "57\n",
1712
+ "[0]\n"
1713
+ ]
1714
+ },
1715
+ {
1716
+ "name": "stderr",
1717
+ "output_type": "stream",
1718
+ "text": [
1719
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
1720
+ "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
1721
+ ]
1722
+ }
1723
+ ],
1724
+ "source": [
1725
+ "from PIL import Image\n",
1726
+ "import torch\n",
1727
+ "import os\n",
1728
+ "import json\n",
1729
+ "import re\n",
1730
+ "\n",
1731
+ "# 假设 processor 和 model 已经初始化好了\n",
1732
+ "# 如果没有,需要添加相关的初始化代码\n",
1733
+ "\n",
1734
+ "def get_category_from_path(path):\n",
1735
+ " \"\"\"从路径中提取类别\"\"\"\n",
1736
+ " if \"porn\" in path:\n",
1737
+ " return \"porn\"\n",
1738
+ " elif \"blood\" in path:\n",
1739
+ " return \"blood\"\n",
1740
+ " elif \"gun\" in path:\n",
1741
+ " return \"gun\"\n",
1742
+ " elif \"insulting_gesture\" in path:\n",
1743
+ " return \"insulting_gesture\"\n",
1744
+ " elif \"alcohol\" in path:\n",
1745
+ " return \"alcohol\"\n",
1746
+ " elif \"cigarette\" in path:\n",
1747
+ " return \"cigarette\"\n",
1748
+ " elif \"knife\" in path:\n",
1749
+ " return \"knife\"\n",
1750
+ " elif \"real_neutral\" in path:\n",
1751
+ " return \"neutral\"\n",
1752
+ " else:\n",
1753
+ " return \"unknown\"\n",
1754
+ "\n",
1755
+ "def process_image(image_path, processor, model):\n",
1756
+ " \"\"\"处理图像并获取描述\"\"\"\n",
1757
+ " try:\n",
1758
+ " image = Image.open(image_path)\n",
1759
+ " \n",
1760
+ " conversation = [\n",
1761
+ " {\n",
1762
+ " \"role\": \"user\",\n",
1763
+ " \"content\": [\n",
1764
+ " {\"type\": \"text\", \"text\": \"Describe this image \"},\n",
1765
+ " {\"type\": \"image\"},\n",
1766
+ " ],\n",
1767
+ " }, \n",
1768
+ " ]\n",
1769
+ " \n",
1770
+ " prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
1771
+ " \n",
1772
+ " inputs = processor(images=image, text=prompt, return_tensors=\"pt\").to(model.device)\n",
1773
+ " \n",
1774
+ " with torch.no_grad():\n",
1775
+ " output = model.generate(**inputs, max_new_tokens=10) # 增加 token 数量以获取更详细的描述\n",
1776
+ " \n",
1777
+ " description = processor.decode(output[0], skip_special_tokens=True)\n",
1778
+ " \n",
1779
+ " # 清理描述文本,移除提示和可能的系统消息\n",
1780
+ " # 这部分可能需要根据实际输出进行调整\n",
1781
+ " description = re.sub(r\"Describe this image\\s*\", \"\", description)\n",
1782
+ " \n",
1783
+ " return description\n",
1784
+ " except ValueError as e:\n",
1785
+ " error_data = e.args[0] # 这将获取到 [indices, categories[0]]\n",
1786
+ " \n",
1787
+ " # 提取 indices 和 categories[0]\n",
1788
+ " if isinstance(error_data, list) and len(error_data) == 2:\n",
1789
+ " captured_indices = error_data[0]\n",
1790
+ " captured_category = error_data[1]\n",
1791
+ "\n",
1792
+ " return f\"Error processing image: {id2class[captured_category]} index {captured_indices}\"\n",
1793
+ "\n",
1794
+ "def main():\n",
1795
+ " image_folders = [\n",
1796
+ " \"/common/home/users/w/wzhao/llava_helper/nsfw_dataset_v1/porn/train\",\n",
1797
+ " \"/common/home/users/w/wzhao/llava_helper/nsfw_dataset_v1/porn/test\",\n",
1798
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/blood/testing/jpg\",\n",
1799
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/blood/training/jpg\",\n",
1800
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/gun/testing/jpg\",\n",
1801
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/gun/training/jpg\",\n",
1802
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/insulting_gesture/training/jpg\",\n",
1803
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/insulting_gesture/testing/jpg\",\n",
1804
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/alcohol/testing/jpg\",\n",
1805
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/alcohol/training/jpg\",\n",
1806
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/cigarette/testing/jpg\",\n",
1807
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/cigarette/training/jpg\",\n",
1808
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/knife/testing/jpg\",\n",
1809
+ " \"/common/home/users/w/wzhao/llava_helper/dataset/class/knife/training/jpg\",\n",
1810
+ " \"/common/home/users/w/wzhao/llava_helper/real_neutral\"\n",
1811
+ " ]\n",
1812
+ " \n",
1813
+ " results = []\n",
1814
+ " total_images = 0\n",
1815
+ " \n",
1816
+ " # 计算总图片数量\n",
1817
+ " for folder in image_folders:\n",
1818
+ " if os.path.exists(folder):\n",
1819
+ " for root, _, files in os.walk(folder):\n",
1820
+ " total_images += sum(1 for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg')))\n",
1821
+ " \n",
1822
+ " print(f\"Total images to process: {total_images}\")\n",
1823
+ " \n",
1824
+ " processed_count = 0\n",
1825
+ " \n",
1826
+ " # 处理每个文件夹中的图片\n",
1827
+ " for folder in image_folders:\n",
1828
+ " if not os.path.exists(folder):\n",
1829
+ " print(f\"Folder not found: {folder}\")\n",
1830
+ " continue\n",
1831
+ " \n",
1832
+ " category = get_category_from_path(folder)\n",
1833
+ " print(f\"Processing folder: {folder} (Category: {category})\")\n",
1834
+ " \n",
1835
+ " for root, _, files in os.walk(folder):\n",
1836
+ " for file in files:\n",
1837
+ " if file.lower().endswith(('.png', '.jpg', '.jpeg')):\n",
1838
+ " image_path = os.path.join(root, file)\n",
1839
+ " \n",
1840
+ " # 处理图像\n",
1841
+ " description = process_image(image_path, processor, model)\n",
1842
+ " \n",
1843
+ " # 存储结果\n",
1844
+ " results.append({\n",
1845
+ " \"path\": image_path,\n",
1846
+ " \"response\": description,\n",
1847
+ " \"category\": category\n",
1848
+ " })\n",
1849
+ " \n",
1850
+ " processed_count += 1\n",
1851
+ " if processed_count % 10 == 0:\n",
1852
+ " print(f\"Processed {processed_count}/{total_images} images\")\n",
1853
+ " \n",
1854
+ " # 每处理100张图片保存一次中间结果\n",
1855
+ " if processed_count % 100 == 0:\n",
1856
+ " with open(f\"/common/home/users/w/wzhao/vqclip/json_results_newest/image_descriptions_checkpoint_{processed_count}.json\", \"w\") as f:\n",
1857
+ " json.dump(results, f, indent=2)\n",
1858
+ " \n",
1859
+ " # 保存最终结果\n",
1860
+ " with open(\"/common/home/users/w/wzhao/vqclip/json_results_newest/image_descriptions_complete.json\", \"w\") as f:\n",
1861
+ " json.dump(results, f, indent=2)\n",
1862
+ " \n",
1863
+ " print(f\"Processing complete. Results saved to image_descriptions_complete.json\")\n",
1864
+ "\n",
1865
+ "if __name__ == \"__main__\":\n",
1866
+ " # 确保已经初始化了processor和model\n",
1867
+ " # 如果没有,需要在这里添加初始化代码\n",
1868
+ " main()"
1869
+ ]
1870
+ },
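The batch-description cell above assumes that processor, model, and id2class already exist in the notebook session. A minimal setup sketch, assuming the stock Hugging Face LLaVA-NeXT classes and a placeholder checkpoint path (the notebook actually loads a custom qllava_new.LlavaForConditionalGeneration, and the id2class ordering below is only a guess):

# Hypothetical setup sketch; the real notebook uses a custom qllava_new model class,
# so the imports, checkpoint path, and id2class mapping here are placeholders.
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

checkpoint = "/path/to/vqllava/checkpoint"  # placeholder path

processor = LlavaNextProcessor.from_pretrained(checkpoint)
model = LlavaNextForConditionalGeneration.from_pretrained(
    checkpoint,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()

# id2class maps the integer class id carried by the ValueError back to a label;
# the ordering is not documented in the notebook, so this dict is an assumption.
id2class = {
    0: "neutral",
    1: "porn",
    2: "blood",
    3: "gun",
    4: "insulting_gesture",
    5: "alcohol",
    6: "cigarette",
    7: "knife",
}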
1871
+ {
1872
+ "cell_type": "code",
1873
+ "execution_count": null,
1874
+ "metadata": {},
1875
+ "outputs": [
1876
+ {
1877
+ "data": {
1878
+ "text/plain": [
1879
+ "array(['54', 'porn'], dtype='<U21')"
1880
+ ]
1881
+ },
1882
+ "execution_count": 4,
1883
+ "metadata": {},
1884
+ "output_type": "execute_result"
1885
+ }
1886
+ ],
1887
+ "source": [
1888
+ "import numpy as np\n",
1889
+ "np.load(\"/common/home/users/w/wzhao/vqclip/classified_results/1.npy\")"
1890
+ ]
1891
+ },
1892
+ {
1893
+ "cell_type": "code",
1894
+ "execution_count": null,
1895
+ "metadata": {},
1896
+ "outputs": [
1897
+ {
1898
+ "name": "stdout",
1899
+ "output_type": "stream",
1900
+ "text": [
1901
+ "**************************************************************\n",
1902
+ "torch.Size([1, 3, 336, 336])\n",
1903
+ "this is q_latent_loss tensor(0.6994, device='cuda:0')\n",
1904
+ "This is e_latent_loss tensor(0.1748, device='cuda:0')\n",
1905
+ "34\n",
1906
+ "[1]\n",
1907
+ "1\n",
1908
+ "检测到不适当内容,生成已停止\n",
1909
+ "**************************************************************\n"
1910
+ ]
1911
+ }
1912
+ ],
1913
+ "source": [
1914
+ "from PIL import Image\n",
1915
+ "import time\n",
1916
+ "image = Image.open('/common/home/users/w/wzhao/qllava/00008.jpg')\n",
1917
+ "#image = Image.open('/common/home/users/w/wzhao/qllava/94915.jpg')\n",
1918
+ "#image = Image.open(\"\")\n",
1919
+ "# Define a chat histiry and use `apply_chat_template` to get correctly formatted prompt\n",
1920
+ "# Each value in \"content\" has to be a list of dicts with types (\"text\", \"image\") \n",
1921
+ "print(\"**************************************************************\")\n",
1922
+ "\n",
1923
+ "\n",
1924
+ "conversation = [\n",
1925
+ " {\n",
1926
+ " \"role\": \"user\",\n",
1927
+ " \"content\": [\n",
1928
+ " {\"type\": \"text\", \"text\": \"Describe this image \"},\n",
1929
+ " {\"type\": \"image\"},\n",
1930
+ " ],\n",
1931
+ " }, \n",
1932
+ "]\n",
1933
+ "\n",
1934
+ "prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
1935
+ "#print(prompt)\n",
1936
+ "\n",
1937
+ "inputs = processor(images=image, text=prompt, return_tensors=\"pt\").to(model.device)\n",
1938
+ "print(inputs[\"pixel_values\"].shape)\n",
1939
+ "\n",
1940
+ "# autoregressively complete prompt\n",
1941
+ "try:\n",
1942
+ " output = model.generate(**inputs, max_new_tokens=50)\n",
1943
+ " print(processor.decode(output[0], skip_special_tokens=True))\n",
1944
+ "except ValueError as e:\n",
1945
+ " # 记录错误信息\n",
1946
+ " \n",
1947
+ " harm_class = int(str(e)))\n",
1948
+ "\n",
1949
+ " \n",
1950
+ " # 可以选择继续执行其他代码或者返回一个预设的安全输出\n",
1951
+ " print(\"检测到不适当内容,生成已停止\")\n",
1952
+ "print(\"**************************************************************\")\n",
1953
+ "#time.sleep(100)"
1954
+ ]
1955
+ },
1956
+ {
1957
+ "cell_type": "code",
1958
+ "execution_count": null,
1959
+ "metadata": {},
1960
+ "outputs": [
1961
+ {
1962
+ "data": {
1963
+ "text/plain": [
1964
+ "LlavaForConditionalGeneration(\n",
1965
+ " (vision_tower): CLIPVisionModel(\n",
1966
+ " (vision_model): CLIPVisionTransformer(\n",
1967
+ " (embeddings): CLIPVisionEmbeddings(\n",
1968
+ " (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)\n",
1969
+ " (position_embedding): Embedding(577, 1024)\n",
1970
+ " )\n",
1971
+ " (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
1972
+ " (encoder): CLIPEncoder(\n",
1973
+ " (layers): ModuleList(\n",
1974
+ " (0-23): 24 x CLIPEncoderLayer(\n",
1975
+ " (self_attn): CLIPSdpaAttention(\n",
1976
+ " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
1977
+ " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
1978
+ " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
1979
+ " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
1980
+ " )\n",
1981
+ " (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
1982
+ " (mlp): CLIPMLP(\n",
1983
+ " (activation_fn): QuickGELUActivation()\n",
1984
+ " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
1985
+ " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
1986
+ " )\n",
1987
+ " (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
1988
+ " )\n",
1989
+ " )\n",
1990
+ " )\n",
1991
+ " (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
1992
+ " )\n",
1993
+ " )\n",
1994
+ " (multi_modal_projector): LlavaMultiModalProjector(\n",
1995
+ " (linear_1): Linear(in_features=1024, out_features=4096, bias=True)\n",
1996
+ " (act): GELUActivation()\n",
1997
+ " (linear_2): Linear(in_features=4096, out_features=4096, bias=True)\n",
1998
+ " (vq): VectorQuantizer(\n",
1999
+ " (embedding): Embedding(16000, 4096)\n",
2000
+ " )\n",
2001
+ " (vq_cls): VectorQuantizerCLS(\n",
2002
+ " (embedding): Embedding(128, 4096)\n",
2003
+ " )\n",
2004
+ " )\n",
2005
+ " (language_model): LlamaForCausalLM(\n",
2006
+ " (model): LlamaModel(\n",
2007
+ " (embed_tokens): Embedding(32064, 4096)\n",
2008
+ " (layers): ModuleList(\n",
2009
+ " (0-31): 32 x LlamaDecoderLayer(\n",
2010
+ " (self_attn): LlamaSdpaAttention(\n",
2011
+ " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
2012
+ " (k_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
2013
+ " (v_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
2014
+ " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n",
2015
+ " (rotary_emb): LlamaRotaryEmbedding()\n",
2016
+ " )\n",
2017
+ " (mlp): LlamaMLP(\n",
2018
+ " (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)\n",
2019
+ " (up_proj): Linear(in_features=4096, out_features=11008, bias=False)\n",
2020
+ " (down_proj): Linear(in_features=11008, out_features=4096, bias=False)\n",
2021
+ " (act_fn): SiLU()\n",
2022
+ " )\n",
2023
+ " (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)\n",
2024
+ " (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)\n",
2025
+ " )\n",
2026
+ " )\n",
2027
+ " (norm): LlamaRMSNorm((4096,), eps=1e-05)\n",
2028
+ " (rotary_emb): LlamaRotaryEmbedding()\n",
2029
+ " )\n",
2030
+ " (lm_head): Linear(in_features=4096, out_features=32064, bias=False)\n",
2031
+ " )\n",
2032
+ ")"
2033
+ ]
2034
+ },
2035
+ "execution_count": 10,
2036
+ "metadata": {},
2037
+ "output_type": "execute_result"
2038
+ }
2039
+ ],
2040
+ "source": [
2041
+ "model"
2042
+ ]
2043
+ },
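The multi-modal projector in the printout above ends in a VectorQuantizer with a 16000 x 4096 codebook, and the generation logs earlier print a q_latent_loss and an e_latent_loss for every image. A generic VQ-VAE-style quantizer sketch, written under standard VQ-VAE assumptions rather than as the notebook's actual VectorQuantizer, showing where those two losses usually come from:

# Generic VQ-VAE-style quantizer sketch; the notebook's VectorQuantizer/VectorQuantizerCLS
# are custom modules, so the sizes and the commitment_cost scaling here are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyVectorQuantizer(nn.Module):
    def __init__(self, num_codes=16000, dim=4096, commitment_cost=0.25):
        super().__init__()
        self.embedding = nn.Embedding(num_codes, dim)
        self.commitment_cost = commitment_cost

    def forward(self, z):                      # z: (batch, tokens, dim)
        flat = z.reshape(-1, z.shape[-1])
        # Squared distances to every codebook vector, then nearest-code lookup
        dists = (
            flat.pow(2).sum(1, keepdim=True)
            - 2 * flat @ self.embedding.weight.t()
            + self.embedding.weight.pow(2).sum(1)
        )
        indices = dists.argmin(dim=1)
        quantized = self.embedding(indices).view_as(z)

        # Codebook loss pulls the codes toward the encoder output; the commitment loss
        # (scaled by commitment_cost) pulls the encoder output toward the chosen codes.
        q_latent_loss = F.mse_loss(quantized, z.detach())
        e_latent_loss = self.commitment_cost * F.mse_loss(quantized.detach(), z)

        # Straight-through estimator so gradients flow back to the encoder
        quantized = z + (quantized - z).detach()
        return quantized, indices, q_latent_loss, e_latent_loss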
2044
+ {
2045
+ "cell_type": "code",
2046
+ "execution_count": null,
2047
+ "metadata": {},
2048
+ "outputs": [
2049
+ {
2050
+ "name": "stdout",
2051
+ "output_type": "stream",
2052
+ "text": [
2053
+ "[2025-03-28 01:00:17,048] [WARNING] [real_accelerator.py:174:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.\n",
2054
+ "[2025-03-28 01:00:17,049] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cpu (auto detect)\n"
2055
+ ]
2056
+ },
2057
+ {
2058
+ "name": "stderr",
2059
+ "output_type": "stream",
2060
+ "text": [
2061
+ "/opt/apps/software/Anaconda3/2024.06-1/compiler_compat/ld: cannot find -laio: No such file or directory\n",
2062
+ "collect2: error: ld returned 1 exit status\n"
2063
+ ]
2064
+ },
2065
+ {
2066
+ "ename": "AttributeError",
2067
+ "evalue": "'list' object has no attribute 'device'",
2068
+ "output_type": "error",
2069
+ "traceback": [
2070
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
2071
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
2072
+ "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m/common/home/users/w/wzhao/vqclip/VQLLMfinal\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n",
2073
+ "File \u001b[0;32m~/.local/lib/python3.12/site-packages/transformers/modeling_utils.py:2971\u001b[0m, in \u001b[0;36mPreTrainedModel.save_pretrained\u001b[0;34m(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)\u001b[0m\n\u001b[1;32m 2968\u001b[0m weights_name \u001b[38;5;241m=\u001b[39m ADAPTER_SAFE_WEIGHTS_NAME \u001b[38;5;28;01mif\u001b[39;00m safe_serialization \u001b[38;5;28;01melse\u001b[39;00m ADAPTER_WEIGHTS_NAME\n\u001b[1;32m 2970\u001b[0m filename_pattern \u001b[38;5;241m=\u001b[39m weights_name\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.bin\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{suffix}\u001b[39;00m\u001b[38;5;124m.bin\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mreplace(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{suffix}\u001b[39;00m\u001b[38;5;124m.safetensors\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m-> 2971\u001b[0m state_dict_split \u001b[38;5;241m=\u001b[39m \u001b[43msplit_torch_state_dict_into_shards\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2972\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_dict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename_pattern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename_pattern\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_shard_size\u001b[49m\n\u001b[1;32m 2973\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2974\u001b[0m \u001b[38;5;66;03m# Save index if sharded\u001b[39;00m\n\u001b[1;32m 2975\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
2074
+ "File \u001b[0;32m~/.local/lib/python3.12/site-packages/huggingface_hub/serialization/_torch.py:369\u001b[0m, in \u001b[0;36msplit_torch_state_dict_into_shards\u001b[0;34m(state_dict, filename_pattern, max_shard_size)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msplit_torch_state_dict_into_shards\u001b[39m(\n\u001b[1;32m 303\u001b[0m state_dict: Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 304\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[1;32m 305\u001b[0m filename_pattern: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m constants\u001b[38;5;241m.\u001b[39mSAFETENSORS_WEIGHTS_FILE_PATTERN,\n\u001b[1;32m 306\u001b[0m max_shard_size: Union[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m MAX_SHARD_SIZE,\n\u001b[1;32m 307\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m StateDictSplit:\n\u001b[1;32m 308\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;124;03m Split a model state dictionary in shards so that each shard is smaller than a given size.\u001b[39;00m\n\u001b[1;32m 310\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 368\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 369\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msplit_state_dict_into_shards_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 370\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_shard_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_shard_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_pattern\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilename_pattern\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 373\u001b[0m \u001b[43m \u001b[49m\u001b[43mget_storage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_torch_storage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mget_storage_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_torch_storage_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
2075
+ "File \u001b[0;32m~/.local/lib/python3.12/site-packages/huggingface_hub/serialization/_base.py:108\u001b[0m, in \u001b[0;36msplit_state_dict_into_shards_factory\u001b[0;34m(state_dict, get_storage_size, filename_pattern, get_storage_id, max_shard_size)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;66;03m# If a `tensor` shares the same underlying storage as another tensor, we put `tensor` in the same `block`\u001b[39;00m\n\u001b[0;32m--> 108\u001b[0m storage_id \u001b[38;5;241m=\u001b[39m \u001b[43mget_storage_id\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensor\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m storage_id \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 110\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m storage_id \u001b[38;5;129;01min\u001b[39;00m storage_id_to_tensors:\n\u001b[1;32m 111\u001b[0m \u001b[38;5;66;03m# We skip this tensor for now and will reassign to correct shard later\u001b[39;00m\n",
2076
+ "File \u001b[0;32m~/.local/lib/python3.12/site-packages/huggingface_hub/serialization/_torch.py:746\u001b[0m, in \u001b[0;36mget_torch_storage_id\u001b[0;34m(tensor)\u001b[0m\n\u001b[1;32m 735\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_torch_storage_id\u001b[39m(tensor: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.Tensor\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Optional[Tuple[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtorch.device\u001b[39m\u001b[38;5;124m\"\u001b[39m, Union[\u001b[38;5;28mint\u001b[39m, Tuple[Any, \u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m]], \u001b[38;5;28mint\u001b[39m]]:\n\u001b[1;32m 736\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;124;03m Return unique identifier to a tensor storage.\u001b[39;00m\n\u001b[1;32m 738\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 744\u001b[0m \u001b[38;5;124;03m Taken from https://github.com/huggingface/transformers/blob/1ecf5f7c982d761b4daaa96719d162c324187c64/src/transformers/pytorch_utils.py#L278.\u001b[39;00m\n\u001b[1;32m 745\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 746\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mtensor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[38;5;241m.\u001b[39mtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 748\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
2077
+ "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'device'"
2078
+ ]
2079
+ }
2080
+ ],
2081
+ "source": [
2082
+ "model.save_pretrained(\n",
2083
+ " \"/common/home/users/w/wzhao/vqclip/VQLLMfinal\"\n",
2084
+ ")"
2085
+ ]
2086
+ },
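+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The traceback above ends in `AttributeError: 'list' object has no attribute 'device'`, raised inside the `huggingface_hub` shard-splitting helper, which expects every state-dict value to be a `torch.Tensor`. That suggests some entry of this model's state dict is a plain Python list rather than a tensor. The next cell is a minimal debugging sketch, not part of the original workflow, assuming `model` is the model loaded earlier in this notebook: it lists any non-tensor state-dict entries so they can be converted to tensors or excluded before retrying `save_pretrained`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "# Minimal sketch: report every state-dict entry that is not a torch.Tensor.\n",
+ "# Such entries are what make get_torch_storage_id() fail during shard splitting.\n",
+ "non_tensor_entries = {\n",
+ "    name: type(value).__name__\n",
+ "    for name, value in model.state_dict().items()\n",
+ "    if not torch.is_tensor(value)\n",
+ "}\n",
+ "print(non_tensor_entries)"
+ ]
+ },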
2087
+ {
2088
+ "cell_type": "code",
2089
+ "execution_count": null,
2090
+ "metadata": {},
2091
+ "outputs": [
2092
+ {
2093
+ "name": "stdout",
2094
+ "output_type": "stream",
2095
+ "text": [
2096
+ "<class 'qllava_new.LlavaForConditionalGeneration'>\n"
2097
+ ]
2098
+ }
2099
+ ],
2100
+ "source": [
2101
+ "print(type(model))"
2102
+ ]
2103
+ },
2104
+ {
2105
+ "cell_type": "code",
2106
+ "execution_count": 2,
2107
+ "metadata": {},
2108
+ "outputs": [
2109
+ {
2110
+ "data": {
2111
+ "text/plain": [
2112
+ "array([56, 1])"
2113
+ ]
2114
+ },
2115
+ "execution_count": 2,
2116
+ "metadata": {},
2117
+ "output_type": "execute_result"
2118
+ }
2119
+ ],
2120
+ "source": [
2121
+ "import numpy as np\n",
2122
+ "np.load(\"/common/home/users/w/wzhao/vqclip/classified_results_llama2/1.npy\")"
2123
+ ]
2124
+ },
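+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Each `.npy` file in `classified_results_llama2` appears to hold a pair `[element_id, class_id]` (here element ID 56 assigned to class 1); the aggregation cell below relies on that layout. The following cell is a small sketch, not part of the original workflow, that makes the assumed layout explicit for a single file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Assumed layout (inferred from the aggregation code below):\n",
+ "# data[0] is the element ID, data[1] is the class ID.\n",
+ "data = np.load(\"/common/home/users/w/wzhao/vqclip/classified_results_llama2/1.npy\")\n",
+ "element_id, class_id = int(data[0]), int(data[1])\n",
+ "print(f\"element_id={element_id}, class_id={class_id}\")"
+ ]
+ },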
2125
+ {
2126
+ "cell_type": "code",
2127
+ "execution_count": 6,
2128
+ "metadata": {},
2129
+ "outputs": [
2130
+ {
2131
+ "name": "stdout",
2132
+ "output_type": "stream",
2133
+ "text": [
2134
+ "统计结果已保存到字典并导出到:\n",
2135
+ "- JSON: /common/home/users/w/wzhao/vqclip/classification_stats.json\n",
2136
+ "- Pickle: /common/home/users/w/wzhao/vqclip/classification_stats.pkl\n",
2137
+ "\n",
2138
+ "基本统计信息:\n",
2139
+ "总共发现 8 个不同的类别\n",
2140
+ "总共处理了 21213 个元素\n",
2141
+ "\n",
2142
+ "各类别的统计摘要:\n",
2143
+ "类别 0: 包含 5413 个元素,有 43 个不同的元素ID\n",
2144
+ " 出现频率最高的元素: ID 16: 615次, ID 60: 609次, ID 14: 566次\n",
2145
+ "类别 1: 包含 5554 个元素,有 4 个不同的元素ID\n",
2146
+ " 出现频率最高的元素: ID 56: 2067次, ID 7: 1430次, ID 34: 1180次\n",
2147
+ "类别 2: 包含 1473 个元素,有 3 个不同的元素ID\n",
2148
+ " 出现频率最高的元素: ID 13: 778次, ID 24: 488次, ID 23: 207次\n",
2149
+ "类别 3: 包含 2134 个元素,有 4 个不同的元素ID\n",
2150
+ " 出现频率最高的元素: ID 46: 840次, ID 33: 641次, ID 39: 351次\n",
2151
+ "类别 4: 包含 1416 个元素,有 2 个不同的元素ID\n",
2152
+ " 出现频率最高的元素: ID 22: 748次, ID 42: 668次\n",
2153
+ "类别 5: 包含 2785 个元素,有 2 个不同的元素ID\n",
2154
+ " 出现频率最高的元素: ID 52: 1654次, ID 4: 1131次\n",
2155
+ "类别 6: 包含 1723 个元素,有 5 个不同的元素ID\n",
2156
+ " 出现频率最高的元素: ID 20: 555次, ID 57: 481次, ID 17: 350次\n",
2157
+ "类别 7: 包含 715 个元素,有 1 个不同的元素ID\n",
2158
+ " 出现频率最高的元素: ID 40: 715次\n"
2159
+ ]
2160
+ }
2161
+ ],
2162
+ "source": [
2163
+ "import os\n",
2164
+ "import numpy as np\n",
2165
+ "import json\n",
2166
+ "from collections import defaultdict\n",
2167
+ "import pickle\n",
2168
+ "\n",
2169
+ "# 定义目录路径\n",
2170
+ "directory_path = '/common/home/users/w/wzhao/vqclip/classified_results_llama2'\n",
2171
+ "\n",
2172
+ "# 创建一个嵌套字典,用于存储每个类别中每个元素ID出现的次数\n",
2173
+ "class_element_counts = defaultdict(lambda: defaultdict(int))\n",
2174
+ "# 创建一个字典用于存储每个类别的总计数\n",
2175
+ "class_total_counts = defaultdict(int)\n",
2176
+ "\n",
2177
+ "# 遍历目录中的所有.npy文件\n",
2178
+ "try:\n",
2179
+ " for filename in os.listdir(directory_path):\n",
2180
+ " if filename.endswith('.npy'):\n",
2181
+ " file_path = os.path.join(directory_path, filename)\n",
2182
+ " \n",
2183
+ " # 加载.npy文件\n",
2184
+ " data = np.load(file_path)\n",
2185
+ " \n",
2186
+ " # 确保数据格式正确\n",
2187
+ " if data.size == 2:\n",
2188
+ " element_id = int(data[0]) # 确保是整数\n",
2189
+ " class_id = int(data[1]) # 确保是整数\n",
2190
+ " \n",
2191
+ " # 增加该元素在对应类别中的计数\n",
2192
+ " class_element_counts[class_id][element_id] += 1\n",
2193
+ " # 增加该类别的总计数\n",
2194
+ " class_total_counts[class_id] += 1\n",
2195
+ " else:\n",
2196
+ " print(f\"警告: 文件 {filename} 的数据格式不符合预期,已跳过\")\n",
2197
+ " \n",
2198
+ " # 将defaultdict转换为普通dict以便序列化\n",
2199
+ " result_dict = {\n",
2200
+ " \"class_totals\": dict(class_total_counts),\n",
2201
+ " \"class_elements\": {\n",
2202
+ " class_id: dict(elements) \n",
2203
+ " for class_id, elements in class_element_counts.items()\n",
2204
+ " }\n",
2205
+ " }\n",
2206
+ " \n",
2207
+ " # 保存结果到JSON文件\n",
2208
+ " output_json_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.json\")\n",
2209
+ " with open(output_json_path, 'w') as f:\n",
2210
+ " json.dump(result_dict, f, indent=2)\n",
2211
+ " \n",
2212
+ " # 也保存为Python pickle格式,这样在后续Python处理中更方便\n",
2213
+ " output_pickle_path = os.path.join(os.path.dirname(directory_path), \"classification_stats.pkl\")\n",
2214
+ " with open(output_pickle_path, 'wb') as f:\n",
2215
+ " pickle.dump(result_dict, f)\n",
2216
+ " \n",
2217
+ " # 打印一些基本统计信息\n",
2218
+ " print(f\"统计结果已保存到字典并导出到:\")\n",
2219
+ " print(f\"- JSON: {output_json_path}\")\n",
2220
+ " print(f\"- Pickle: {output_pickle_path}\")\n",
2221
+ " print(\"\\n基本统计信息:\")\n",
2222
+ " print(f\"总共发现 {len(class_total_counts)} 个不同的类别\")\n",
2223
+ " total_elements = sum(class_total_counts.values())\n",
2224
+ " print(f\"总共处理了 {total_elements} 个元素\")\n",
2225
+ " \n",
2226
+ " # 打印每个类别的样本统计\n",
2227
+ " print(\"\\n各类别的统计摘要:\")\n",
2228
+ " for class_id in sorted(class_total_counts.keys()):\n",
2229
+ " print(f\"类别 {class_id}: 包含 {class_total_counts[class_id]} 个元素,有 {len(class_element_counts[class_id])} 个不同的元素ID\")\n",
2230
+ " \n",
2231
+ " # 获取该类别中出现次数最多的3个元素\n",
2232
+ " sorted_elements = sorted(class_element_counts[class_id].items(), \n",
2233
+ " key=lambda x: x[1], reverse=True)[:3]\n",
2234
+ " \n",
2235
+ " # 打印这些元素及其出现次数\n",
2236
+ " print(f\" 出现频率最高的元素: \" + \", \".join([f\"ID {e_id}: {count}次\" for e_id, count in sorted_elements]))\n",
2237
+ "\n",
2238
+ "except Exception as e:\n",
2239
+ " print(f\"发生错误: {e}\")"
2240
+ ]
2241
+ }
2242
+ ],
2243
+ "metadata": {
2244
+ "kernelspec": {
2245
+ "display_name": "Python 3 (ipykernel)",
2246
+ "language": "python",
2247
+ "name": "python3"
2248
+ },
2249
+ "language_info": {
2250
+ "codemirror_mode": {
2251
+ "name": "ipython",
2252
+ "version": 3
2253
+ },
2254
+ "file_extension": ".py",
2255
+ "mimetype": "text/x-python",
2256
+ "name": "python",
2257
+ "nbconvert_exporter": "python",
2258
+ "pygments_lexer": "ipython3",
2259
+ "version": "3.12.4"
2260
+ }
2261
+ },
2262
+ "nbformat": 4,
2263
+ "nbformat_minor": 2
2264
+ }