{
  "model_type": "speaker_encoder",
  "architecture": "LSTM",
  "input_dim": 40,
  "hidden_dim": 256,
  "num_layers": 3,
  "output_dim": 256,
  "dropout": 0.1,
  "sample_rate": 16000,
  "window_size": 0.04,
  "window_stride": 0.01,
  "n_mels": 40,
  "embedding_size": 256,
  "prenet_dims": [256, 256],
  "lstm_dims": 256,
  "num_lstm_layers": 3,
  "speaker_embedding_size": 256,
  "use_cuda": true,
  "model_name": "speaker_encoder",
  "version": "1.0",
  "authors": ["Arjit"],
  "description": "Speaker encoder model for voice conversion tasks"
}