Text-to-Speech
Safetensors
English
Chinese
File size: 1,164 Bytes
9777ed1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Root mapping for the audio tokenizer configuration; each nested mapping
# below is a separately named section of it.
audio_tokenizer:
  # Mel-spectrogram extraction parameters.
  # NOTE(review): sample_rate is presumably in Hz and the window/hop sizes in
  # samples — confirm against the consumer.
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    # win_length is exactly twice hop_length (640 vs 320).
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    # Explicit null: no upper mel frequency is configured here; how null is
    # interpreted is up to the consumer.
    mel_fmax: null
    num_mels: 128

  # Encoder section. input_channels and out_channels are both 1024.
  # NOTE(review): the vocos_* keys presumably size a Vocos-style backbone —
  # confirm against the model code.
  encoder:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    # Two-element list, both entries 1.
    sample_ratios: [1, 1]

  # Decoder section.
  # NOTE(review): this section spells its key `input_channel` (singular) while
  # the encoder/prenet/postnet sections use `input_channels`; left unchanged
  # because the consumer presumably looks it up by this exact name — confirm
  # before renaming.
  decoder:
    input_channel: 1024
    channels: 1536
    # rates and kernel_sizes each have four entries (presumably one per
    # stage — confirm). The product of rates is 8 * 5 * 4 * 2 = 320, which
    # equals mel_params.hop_length.
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]

  # Quantizer section. input_dim (1024) matches the encoder's out_channels.
  quantizer:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    # NOTE(review): the key is spelled `use_l2_normlize` (sic). It is kept
    # as-is because the consumer presumably reads this exact name — confirm
    # before correcting the spelling.
    use_l2_normlize: true
    threshold_ema_dead_code: 0.2
  
  # Speaker encoder section.
  speaker_encoder:
    # Same value as mel_params.num_mels (128).
    # NOTE(review): presumably the input is the mel spectrogram — confirm.
    input_dim: 128
    # Same value as the 1024-wide channels used by the other sections.
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    # Six entries, each 4.
    # NOTE(review): presumably per-dimension finite-scalar-quantization
    # levels — confirm against the model code.
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  # Prenet section. Shares the encoder's vocos_* sizes and 1024-wide
  # input/output channels, and additionally sets condition_dim and
  # use_tanh_at_final.
  prenet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    # Same value as speaker_encoder.out_dim (1024).
    # NOTE(review): presumably the conditioning comes from the speaker
    # encoder — confirm.
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: false

  # Postnet section. Same channel and vocos_* widths as the encoder and
  # prenet sections, but with 6 layers instead of 12.
  postnet:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false