{
  "_name_or_path": "naver-clova-ix/donut-base",
  "architectures": [
    "VisionEncoderDecoderModel"
  ],
  "decoder": {
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layers": 4,
    "dropout": 0.1,
    "max_position_embeddings": 768,
    "model_type": "mbart",
    "torch_dtype": "float32",
    "vocab_size": 57612
  },
  "encoder": {
    "embed_dim": 128,
    "image_size": [
      1280,
      960
    ],
    "hidden_act": "gelu",
    "hidden_size": 1024,
    "layer_norm_eps": 1e-05,
    "model_type": "donut-swin",
    "num_channels": 3,
    "num_heads": [
      4,
      8,
      16,
      32
    ],
    "num_layers": 4,
    "patch_size": 4,
    "qkv_bias": true,
    "window_size": 10
  },
  "input_size": [
    1280,
    960
  ],
  "is_encoder_decoder": true,
  "max_length": 768,
  "model_type": "vision-encoder-decoder",
  "pad_token_id": 1,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.30.1"
}