Approximetal committed on
Commit
f36e46d
·
verified ·
1 Parent(s): e702978

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +116 -9
  2. app.py +19 -0
  3. apt.txt +3 -0
  4. gradio_mix.py +1209 -0
  5. inference_gradio.py +576 -0
  6. lemas_tts/__init__.py +6 -0
  7. lemas_tts/api.py +306 -0
  8. lemas_tts/configs/multilingual_grl.yaml +78 -0
  9. lemas_tts/configs/multilingual_prosody.yaml +78 -0
  10. lemas_tts/infer/edit_multilingual.py +184 -0
  11. lemas_tts/infer/frontend.py +251 -0
  12. lemas_tts/infer/infer_cli.py +386 -0
  13. lemas_tts/infer/text_norm/__init__.py +0 -0
  14. lemas_tts/infer/text_norm/cn_tn.py +824 -0
  15. lemas_tts/infer/text_norm/en_tn.py +178 -0
  16. lemas_tts/infer/text_norm/gp2py.py +148 -0
  17. lemas_tts/infer/text_norm/id_tn.py +275 -0
  18. lemas_tts/infer/text_norm/jieba_dict.txt +0 -0
  19. lemas_tts/infer/text_norm/pinyin-lexicon-r.txt +4120 -0
  20. lemas_tts/infer/text_norm/symbols.py +419 -0
  21. lemas_tts/infer/text_norm/tokenizer.py +235 -0
  22. lemas_tts/infer/text_norm/txt2pinyin.py +225 -0
  23. lemas_tts/infer/utils_infer.py +661 -0
  24. lemas_tts/model/backbones/README.md +20 -0
  25. lemas_tts/model/backbones/dit.py +254 -0
  26. lemas_tts/model/backbones/ecapa_tdnn.py +931 -0
  27. lemas_tts/model/backbones/mmdit.py +189 -0
  28. lemas_tts/model/backbones/prosody_encoder.py +433 -0
  29. lemas_tts/model/backbones/unett.py +250 -0
  30. lemas_tts/model/cfm.py +899 -0
  31. lemas_tts/model/modules.py +802 -0
  32. lemas_tts/model/utils.py +190 -0
  33. lemas_tts/scripts/inference_gradio.py +584 -0
  34. requirements.txt +185 -0
  35. uvr5/gui_data/constants.py +1147 -0
  36. uvr5/lib_v5/mdxnet.py +140 -0
  37. uvr5/lib_v5/mixer.ckpt +3 -0
  38. uvr5/lib_v5/modules.py +74 -0
  39. uvr5/lib_v5/pyrb.py +92 -0
  40. uvr5/lib_v5/spec_utils.py +703 -0
  41. uvr5/lib_v5/vr_network/__init__.py +1 -0
  42. uvr5/lib_v5/vr_network/layers.py +143 -0
  43. uvr5/lib_v5/vr_network/layers_new.py +126 -0
  44. uvr5/lib_v5/vr_network/model_param_init.py +59 -0
  45. uvr5/lib_v5/vr_network/modelparams/1band_sr16000_hl512.json +19 -0
  46. uvr5/lib_v5/vr_network/modelparams/1band_sr32000_hl512.json +19 -0
  47. uvr5/lib_v5/vr_network/modelparams/1band_sr33075_hl384.json +19 -0
  48. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json +19 -0
  49. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl256.json +19 -0
  50. uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl512.json +19 -0
README.md CHANGED
@@ -1,12 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: LEMAS Edit
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 6.2.0
8
- app_file: app.py
9
- pinned: false
 
 
 
 
 
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # LEMAS-TTS Gradio Demo (Hugging Face Space)
2
+
3
+ This folder is a **clean, inference-only** version of LEMAS-TTS, organized for easy deployment on **Hugging Face Spaces**.
4
+
5
+ It keeps only:
6
+ - the inference models & configs (`lemas_tts`)
7
+ - pretrained checkpoints and vocab (`pretrained_models`)
8
+ - the bundled UVR5 denoiser (`uvr5`)
9
+ - a Gradio web UI (`inference_gradio.py`, `app.py`)
10
+
11
+ ---
12
+
13
+ ## 1. Features
14
+
15
+ - Zero-shot TTS: clone voice from a reference audio + reference text
16
+ - Multilingual text input (Chinese / English / ES / IT / PT / DE, etc.)
17
+ - Optional UVR5-based reference denoising
18
+ - Two custom LEMAS checkpoints:
19
+ - `multilingual_prosody_custom`
20
+ - `multilingual_acc_grl_custom`
21
+
22
+ ---
23
+
24
+ ## 2. Project Structure
25
+
26
+ ```text
27
+ LEMAS-TTS_gradio/
28
+ app.py # HF Space entrypoint (Gradio Blocks)
29
+ inference_gradio.py # Full Gradio UI & logic
30
+ requirements.txt # Minimal runtime dependencies
31
+
32
+ lemas_tts/ # Core LEMAS-TTS package (inference only)
33
+ api.py # F5TTS API (used by the UI)
34
+ configs/ # Model configs (F5TTS / E2TTS)
35
+ infer/ # Inference utilities & text frontend
36
+ model/ # DiT backbone, utils, etc.
37
+
38
+ pretrained_models/ # All local assets needed for inference
39
+ ckpts/
40
+ F5TTS_v1_Base_vocos_custom_multilingual_prosody/model_2698000.pt
41
+ F5TTS_v1_Base_vocos_custom_multilingual_acc_grl/model_2680000.pt
42
+ prosody_encoder/...
43
+ vocos-mel-24khz/...
44
+ data/
45
+ multilingual_prosody_custom/vocab.txt
46
+ multilingual_acc_grl_custom/vocab.txt
47
+ test_examples/*.wav # Demo audios used in the UI
48
+ uvr5/
49
+ models/MDX_Net_Models/model_data/*.onnx, *.json
50
+
51
+ uvr5/ # Bundled UVR5 implementation for denoising
52
+ ```
53
+
54
+ `lemas_tts.api.F5TTS` automatically resolves `pretrained_models/` based on the repo layout, so no extra path configuration is required.
55
+
56
+ ---
57
+
58
+ ## 3. How to Run Locally
59
+
60
+ ```bash
61
+ cd LEMAS-TTS_gradio
62
+ pip install -r requirements.txt
63
+ python app.py
64
+ ```
65
+
66
+ Then open the printed URL (default `http://127.0.0.1:7860`) in your browser.
67
+
68
  ---
69
+
70
+ ## 4. Hugging Face Space Setup
71
+
72
+ 1. Create a new Space (type: **Gradio**).
73
+ 2. Upload the contents of `LEMAS-TTS_gradio/` to the Space repo:
74
+ - `app.py`
75
+ - `inference_gradio.py`
76
+ - `requirements.txt`
77
+ - `lemas_tts/`
78
+ - `pretrained_models/`
79
+ - `uvr5/`
80
+ 3. In the Space settings, choose a GPU hardware profile (the model is heavy).
81
+ 4. The Space will automatically run `app.py` and launch the Gradio Blocks named `app`.
82
+
83
+ No extra arguments are needed; all paths are relative inside the repo.
84
+
85
  ---
86
 
87
+ ## 5. Usage Tips
88
+
89
+ - **Reference Text** should match the reference audio roughly in content and language for best voice cloning.
90
+ - **Denoise**:
91
+ - Turn on if your reference audio is noisy; it runs UVR5 on CPU.
92
+ - Turn off if the reference is already clean (saves time).
93
+ - **Seed**:
94
+ - `-1` → random seed
95
+ - Any other integer → reproducible output
96
+
97
+ ---
98
+
99
+ ## 6. 中文说明(简要)
100
+
101
+ 这个目录是专门为 **Hugging Face Space** 打包的 **推理版 LEMAS-TTS**:
102
+
103
+ - 只保留推理相关代码(`lemas_tts`)、预训练模型(`pretrained_models`)和 UVR5 去噪模块(`uvr5`)
104
+ - Gradio 入口为 `app.py`,内部调用 `inference_gradio.py` 里的 `app`(一个 `gr.Blocks` 界面)
105
+ - `pretrained_models/` 下已经包含:
106
+ - 自定义多语种 prosody / accent GRL 的 finetune 权重
107
+ - vocoder(`vocos-mel-24khz`)
108
+ - prosody encoder
109
+ - 以及示例语音 `test_examples/*.wav`
110
+
111
+ 在本地或 Space 中运行步骤:
112
+
113
+ ```bash
114
+ pip install -r requirements.txt
115
+ python app.py
116
+ ```
117
+
118
+ 然后在浏览器中打开提示的链接即可使用零样本 TTS Demo。
119
+
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio entrypoint for Hugging Face Spaces for LEMAS-Edit.
3
+
4
+ This file exposes the Blocks app defined in `gradio_mix.get_app`.
5
+ """
6
+
7
+ import gradio as gr # noqa: F401
8
+
9
+ from gradio_mix import get_app
10
+
11
+ _app = get_app()
12
+
13
+ # Expose as both `app` and `demo` for maximum compatibility
14
+ app = _app
15
+ demo = _app
16
+
17
+
18
+ if __name__ == "__main__":
19
+ app.queue(api_open=True).launch()
apt.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ffmpeg
2
+ espeak-ng
3
+ espeak
gradio_mix.py ADDED
@@ -0,0 +1,1209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, gc
2
+ import re, time
3
+ import logging
4
+ from num2words import num2words
5
+ import gradio as gr
6
+ import torch, torchaudio
7
+ import numpy as np
8
+ import random
9
+ from scipy.io import wavfile
10
+ import onnx
11
+ import onnxruntime as ort
12
+ import copy
13
+ import uroman as ur
14
+ import jieba, zhconv
15
+ from pypinyin.core import Pinyin
16
+ from pypinyin import Style
17
+
18
+ from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT
19
+ from lemas_tts.infer.edit_multilingual import gen_wav_multilingual
20
+ from lemas_tts.infer.text_norm.txt2pinyin import (
21
+ MyConverter,
22
+ _PAUSE_SYMBOL,
23
+ change_tone_in_bu_or_yi,
24
+ get_phoneme_from_char_and_pinyin,
25
+ )
26
+ from lemas_tts.infer.text_norm.cn_tn import NSWNormalizer
27
+ # import io
28
+ # import uuid
29
# ---------------------------------------------------------------------------
# Module-level runtime setup
# ---------------------------------------------------------------------------
# Use the repo-bundled jieba dictionary (Chinese word segmentation) when it
# exists, instead of jieba's default.
_JIEBA_DICT = os.path.join(
    os.path.dirname(__file__),
    "lemas_tts",
    "infer",
    "text_norm",
    "jieba_dict.txt",
)
if os.path.isfile(_JIEBA_DICT):
    jieba.set_dictionary(_JIEBA_DICT)

import langid
# Restrict langid's search space to the languages the TTS frontend supports.
langid.set_languages(['es','pt','zh','en','de','fr','it', 'ar', 'ru', 'ja', 'ko', 'hi', 'th', 'id', 'vi'])

# Empty CA bundle disables curl certificate lookup (works around SSL issues
# on some hosts — NOTE(review): this weakens TLS verification).
os.environ['CURL_CA_BUNDLE'] = ''
DEMO_PATH = os.getenv("DEMO_PATH", "./demo")                    # demo asset dir
TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")                 # scratch dir
MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")   # local model root

device = "cuda" if torch.cuda.is_available() else "cpu"
ASR_DEVICE = "cpu"  # force whisperx/pyannote to CPU to avoid cuDNN issues
# Globals populated lazily by load_models().
whisper_model, align_model = None, None
tts_edit_model = None

_whitespace_re = re.compile(r"\s+")
alpha_pattern = re.compile(r"[a-zA-Z]")

formatter = ("%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d || %(message)s")
logging.basicConfig(format=formatter, level=logging.INFO)
65
+
66
+ # def get_random_string():
67
+ # return "".join(str(uuid.uuid4()).split("-"))
68
+
69
def seed_everything(seed):
    """Seed every RNG used by the pipeline; ``seed == -1`` means stay random."""
    if seed == -1:
        return
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Trade cuDNN autotuning for reproducible kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
78
+
79
+
80
+ # class AudioSR:
81
+ # def __init__(self, model_name):
82
+ # code_dir = "/cto_labs/vistring/zhaozhiyuan/code/SpeechAugment/versatile_audio_super_resolution"
83
+ # self.model = self.load_model(model_name, code_dir)
84
+ # self.sr = 48000
85
+ # self.chunk_size=10.24
86
+ # self.overlap=0.16
87
+ # self.guidance_scale=1
88
+ # self.ddim_steps=20
89
+ # self.multiband_ensemble=False
90
+
91
+ # def load_model(self, model_name, code_dir):
92
+ # import sys, json
93
+ # sys.path.append(code_dir)
94
+ # from inference import Predictor
95
+ # sr_model = Predictor()
96
+ # sr_model.setup(model_name)
97
+ # return sr_model
98
+
99
+ # def audiosr(self, in_wav, src_sr, tar_sr, chunk_size=10.24, overlap=0.16, seed=0, guidance_scale=1, ddim_steps=20, multiband_ensemble=False):
100
+ # if seed == 0:
101
+ # seed = random.randint(0, 2**32 - 1)
102
+ # print(f"Setting seed to: {seed}")
103
+ # print(f"overlap = {overlap}")
104
+ # print(f"guidance_scale = {guidance_scale}")
105
+ # print(f"ddim_steps = {ddim_steps}")
106
+ # print(f"chunk_size = {chunk_size}")
107
+ # print(f"multiband_ensemble = {multiband_ensemble}")
108
+ # print(f"in_wav.shape = {in_wav.shape}")
109
+
110
+ # in_wav = torchaudio.functional.resample(in_wav.squeeze(), src_sr, 24000)
111
+ # in_wav = in_wav.squeeze().numpy()
112
+
113
+ # out_wav = self.model.process_audio(
114
+ # in_wav, 24000,
115
+ # chunk_size=chunk_size,
116
+ # overlap=overlap,
117
+ # seed=seed,
118
+ # guidance_scale=guidance_scale,
119
+ # ddim_steps=ddim_steps,
120
+ # multiband_ensemble=multiband_ensemble,
121
+ # )
122
+ # out_wav = out_wav[:int(self.sr*in_wav.shape[0]/24000)].T
123
+ # if tar_sr != self.sr:
124
+ # out_wav = torchaudio.functional.resample(torch.from_numpy(out_wav).squeeze(), self.sr, tar_sr)
125
+ # else:
126
+ # out_wav = torch.from_numpy(out_wav)
127
+ # print(f"out.shape = {out_wav.shape} tar_sr={tar_sr}")
128
+ # return out_wav.squeeze()
129
+
130
+
131
class UVR5:
    """Small wrapper around the bundled uvr5 implementation for denoising."""

    def __init__(self, model_dir):
        # model_dir holds Kim_Vocal_1.onnx and its MDX-Net JSON config.
        code_dir = os.path.join(os.path.dirname(__file__), "uvr5")
        self.model = self.load_model(model_dir, code_dir)

    def load_model(self, model_dir, code_dir):
        """Build an MDX-Net ``Inference`` object from the bundled uvr5 package.

        Appends ``code_dir`` to ``sys.path`` so the bundled package imports.
        """
        import sys, json
        if code_dir not in sys.path:
            sys.path.append(code_dir)
        from multiprocess_cuda_infer import ModelData, Inference
        model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
        config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
        with open(config_path, "r", encoding="utf-8") as f:
            configs = json.load(f)
        model_data = ModelData(
            model_path=model_path,
            audio_path = model_dir,
            result_path = model_dir,
            device = 'cpu',  # denoising deliberately runs on CPU
            process_method = "MDX-Net",
            base_dir=model_dir,
            **configs
        )

        uvr5_model = Inference(model_data, 'cpu')
        uvr5_model.load_model(model_path, 1)
        return uvr5_model

    def denoise(self, audio_info):
        """Denoise a Gradio ``(sr, ndarray)`` pair.

        Returns ``(waveform ndarray, 44100)``; UVR5 expects stereo 44.1 kHz.
        """
        input_audio = load_wav(audio_info, sr=44100, channel=2)
        # NOTE(review): demix_base is assumed to return a torch tensor with
        # channels-first layout — confirm against the bundled uvr5 code.
        output_audio = self.model.demix_base({0:input_audio.squeeze()}, is_match_mix=False)
        return output_audio.squeeze().T.numpy(), 44100
167
+
168
+
169
class DeepFilterNet:
    """Streaming ONNX DeepFilterNet denoiser running frame-by-frame on CPU."""

    def __init__(self, model_path):
        self.hop_size = 480   # samples per streaming frame (10 ms @ 48 kHz)
        self.fft_size = 960   # analysis window; also the lookahead padding
        self.model = self.load_model(model_path)


    def load_model(self, model_path, threads=1):
        """Create a single-threaded CPU onnxruntime session for the model."""
        sess_options = ort.SessionOptions()
        sess_options.intra_op_num_threads = threads
        sess_options.graph_optimization_level = (ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED)
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        model = onnx.load_model(model_path)
        ort_session = ort.InferenceSession(
            model.SerializeToString(),
            sess_options,
            providers=["CPUExecutionProvider"],
        )

        # NOTE(review): these two locals only document the model's I/O names
        # and are otherwise unused.
        input_names = ["input_frame", "states", "atten_lim_db"]
        output_names = ["enhanced_audio_frame", "new_states", "lsnr"]
        return ort_session


    def denoise(self, audio_info):
        """Denoise a Gradio ``(sr, ndarray)`` pair; returns ``(ndarray, 48000)``."""
        wav = load_wav(audio_info, 48000)
        orig_len = wav.shape[-1]
        # Pad so length divides evenly into hop_size frames, plus fft_size of
        # trailing lookahead for the final frames.
        hop_size_divisible_padding_size = (self.hop_size - orig_len % self.hop_size) % self.hop_size
        orig_len += hop_size_divisible_padding_size
        wav = torch.nn.functional.pad(
            wav, (0, self.fft_size + hop_size_divisible_padding_size)
        )
        chunked_audio = torch.split(wav, self.hop_size)

        # Recurrent model state carried across successive frames.
        state = np.zeros(45304,dtype=np.float32)
        atten_lim_db = np.zeros(1,dtype=np.float32)
        enhanced = []
        for frame in chunked_audio:
            out = self.model.run(None,input_feed={"input_frame":frame.numpy(),"states":state,"atten_lim_db":atten_lim_db})
            enhanced.append(torch.tensor(out[0]))
            state = out[1]

        enhanced_audio = torch.cat(enhanced).unsqueeze(0)  # [t] -> [1, t] typical mono format

        # Drop the algorithmic delay (fft_size - hop_size samples).
        d = self.fft_size - self.hop_size
        enhanced_audio = enhanced_audio[:, d: orig_len + d]

        return enhanced_audio.squeeze().numpy(), 48000
219
+
220
+
221
class TextNorm():
    """Chinese text normalization: pause tagging, transcript rebuilding,
    and character-to-pinyin/phoneme conversion for the TTS frontend."""

    def __init__(self):
        my_pinyin = Pinyin(MyConverter())
        self.pinyin_parser = my_pinyin.pinyin

    def sil_type(self, time_s):
        """Map a silence duration in seconds to a pause tag ("" / #1..#4).

        NOTE(review): ``round(time_s)`` rounds to an *integer*, so the
        0.4 / 0.8 / 1.5 thresholds effectively behave as 0 / 1 / 2 — confirm
        whether ``round(time_s, 1)`` was intended.
        """
        if round(time_s) < 0.4:
            return ""
        elif round(time_s) >= 0.4 and round(time_s) < 0.8:
            return "#1"
        elif round(time_s) >= 0.8 and round(time_s) < 1.5:
            return "#2"
        elif round(time_s) >= 1.5 and round(time_s) < 3.0:
            return "#3"
        elif round(time_s) >= 3.0:
            return "#4"


    def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
        """Rebuild a transcript string with pause tags inserted between words,
        substituting ``target_transcript`` for the words inside the edit window.

        Args:
            sub_list: word dicts with "word"/"start"/"end" keys.
            start_time, end_time: the edit window in seconds.
            target_transcript: replacement text for the window.
        """
        txt = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txt.append(sil)
        txt.append(txt_list[0])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txt.append(target_transcript)
                target_transcript = ""  # substitute only once for the window
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txt.append(sil)
                txt.append(txt_list[i])
        return ' '.join(txt)

    def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
        """Like ``add_sil_raw`` but returns ``[[lang, text], ...]`` runs, using
        ``tar_lang`` for the replacement and merging same-language neighbors."""
        txts = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txts.append([src_lang, sil])

        if sub_list[0]["start"] < start_time:
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                txts.append([tar_lang, target_transcript])
                target_transcript = ""  # substitute only once
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])

        # Merge consecutive entries that share a language tag.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]

        return target_txt


    def get_prompt(self, sub_list, start_time, end_time, src_lang):
        """Collect ``[[lang, text], ...]`` runs (with pause tags) for the words
        inside [start_time, end_time], for use as the prompt transcript."""
        txts = []
        txt_list = [x["word"] for x in sub_list]

        if start_time <= sub_list[0]["start"]:
            sil = self.sil_type(sub_list[0]["start"])
            if len(sil) > 0:
                txts.append([src_lang, sil])
            txts.append([src_lang, txt_list[0]])

        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])

        # Merge consecutive entries that share a language tag.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt


    def txt2pinyin(self, text):
        """Normalize *text* and convert it to parallel char/phoneme lists.

        Splits on embedded pause tags (#1..#4), normalizes each segment with
        NSWNormalizer, segments with jieba, and converts Chinese words to
        pinyin-derived phonemes. Returns ``(txts, phonemes)``.
        """
        txts, phonemes = [], []
        texts = re.split(r"(#\d)", text)
        print("before norm: ", texts)
        for text in texts:
            if text in {'#1', '#2', '#3', '#4'}:
                # Pause tags pass through to both streams unchanged.
                txts.append(text)
                phonemes.append(text)
                continue
            text = NSWNormalizer(text.strip()).normalize()

            text_list = list(jieba.cut(text))
            print("jieba cut: ", text, text_list)
            for words in text_list:
                if words in _PAUSE_SYMBOL:
                    # Punctuation maps to a pause symbol attached to the
                    # previous token.
                    phonemes[-1] += _PAUSE_SYMBOL[words]
                    txts[-1] += words
                elif re.search("[\u4e00-\u9fa5]+", words):
                    # CJK word: convert to numbered-tone pinyin, then phonemes.
                    pinyin = self.pinyin_parser(words, style=Style.TONE3, errors="ignore")
                    new_pinyin = []
                    for x in pinyin:
                        x = "".join(x)
                        if "#" not in x:
                            new_pinyin.append(x)
                        else:
                            phonemes.append(words)
                            continue
                    # Tone sandhi for 不/一 unless the word ends with them.
                    new_pinyin = change_tone_in_bu_or_yi(words, new_pinyin) if len(words)>1 and words[-1] not in {"一","不"} else new_pinyin
                    phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
                    phonemes += phoneme
                    txts += list(words)
                elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
                    # Latin text / explicit pause tags pass through verbatim.
                    phonemes.append(words)
                    txts.append(words)
        return txts, phonemes
355
+
356
+
357
+
358
def chunk_text(text, max_chars=135):
    """
    Split *text* into chunks of at most ``max_chars`` UTF-8 bytes each.

    Sentences (delimited by ASCII punctuation followed by whitespace, or CJK
    punctuation) are kept whole and greedily packed into chunks.

    Args:
        text (str): The text to be split.
        max_chars (int): Maximum UTF-8 byte length per chunk.

    Returns:
        List[str]: The text chunks, stripped of surrounding whitespace.
    """
    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)

    chunks, buf = [], ""
    for sentence in sentences:
        # Sentences ending in a single-byte (ASCII) char get a space separator.
        if sentence and len(sentence[-1].encode("utf-8")) == 1:
            piece = sentence + " "
        else:
            piece = sentence
        if len(buf.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
            buf += piece
        else:
            if buf:
                chunks.append(buf.strip())
            buf = piece

    if buf:
        chunks.append(buf.strip())

    return chunks
386
+
387
+
388
class MMSAlignModel:
    """Forced aligner built on torchaudio's MMS_FA pipeline + uroman."""

    def __init__(self):
        from torchaudio.pipelines import MMS_FA as bundle
        self.mms_model = bundle.get_model()
        self.mms_model.to(device)
        self.mms_tokenizer = bundle.get_tokenizer()
        self.mms_aligner = bundle.get_aligner()
        # Romanizer used to map arbitrary scripts to the aligner's alphabet.
        self.text_normalizer = ur.Uroman()


    def text_normalization(self, text_list):
        """Lower-case each word, keeping letters and apostrophes; hyphens and
        otherwise-empty words become the wildcard '*'."""
        text_normalized = []
        for word in text_list:
            text_char = ''
            for c in word:
                if c.isalpha() or c=="'":
                    text_char += c.lower()
                elif c == "-":
                    text_char += '*'
            text_char = text_char if len(text_char) > 0 else "*"
            text_normalized.append(text_char)
        assert len(text_normalized) == len(text_list), f"normalized text len != raw text len: {len(text_normalized)} != {text_list}"
        return text_normalized

    def compute_alignments(self, waveform: torch.Tensor, tokens):
        """Run the acoustic model and aligner; returns (emission, token_spans)."""
        with torch.inference_mode():
            emission, _ = self.mms_model(waveform.to(device))
            token_spans = self.mms_aligner(emission[0], tokens)
        return emission, token_spans


    def align(self, data, wav):
        """Align ``data['text']`` against *wav* (a Gradio ``(sr, ndarray)`` pair).

        Returns a dict with per-word start/end times (seconds) and scores.
        """
        waveform = load_wav(wav, 16000).unsqueeze(0)
        raw_text = data['text'][0]
        text = " ".join(data['text'][1]).replace("-", " ")
        text = re.sub("\s+", " ", text)
        # Romanize, then strip everything outside the aligner's alphabet.
        text_normed = self.text_normalizer.romanize_string(text, lcode=data["lang"])
        # NOTE(review): local name 'fliter' is a typo for 'filter' (kept as-is).
        fliter = re.compile("[^a-z^*^'^ ]")
        text_normed = fliter.sub('', text_normed.lower())
        text_normed = re.sub("\s+", " ", text_normed)
        text_normed = text_normed.split()
        # Alignment requires a 1:1 word correspondence with the raw text.
        assert len(text_normed) == len(raw_text), f"normalized text len != raw text len: {len(text_normed)} != {len(raw_text)}"
        tokens = self.mms_tokenizer(text_normed)
        with torch.inference_mode():
            emission, _ = self.mms_model(waveform.to(device))
            token_spans = self.mms_aligner(emission[0], tokens)
        num_frames = emission.size(1)
        ratio = waveform.size(1) / num_frames
        res = []
        for i in range(len(token_spans)):
            # Mean span score; frame indices converted to seconds at 16 kHz.
            score = round(sum([x.score for x in token_spans[i]]) / len(token_spans[i]), ndigits=3)
            start = round(waveform.size(-1) * token_spans[i][0].start / num_frames / 16000, ndigits=3)
            end = round(waveform.size(-1) * token_spans[i][-1].end / num_frames / 16000, ndigits=3)
            res.append({"word": raw_text[i], "start": start, "end": end, "score": score})

        res = {"lang":data["lang"], "start": 0, "end": round(waveform.shape[-1]/16000, ndigits=3), "text_raw":data["text_raw"], "text": text, "words": res}
        return res
446
+
447
+
448
class WhisperxModel:
    """WhisperX-based transcriber; output feeds the global forced aligner."""

    def __init__(self, model_name):
        """Load a whisperx ASR model on ``ASR_DEVICE``, preferring a local VAD
        checkpoint under ``MODELS_PATH`` to avoid network downloads."""
        from whisperx import load_model
        from pathlib import Path
        prompt = None

        # Prefer a local VAD model (to avoid network download / 301 issues)
        vad_fp = Path(MODELS_PATH) / "whisperx-vad-segmentation.bin"
        if not vad_fp.is_file():
            logging.warning(
                "Local whisperx VAD not found at %s, falling back to default download path.",
                vad_fp,
            )
            vad_fp = None

        self.model = load_model(
            model_name,
            ASR_DEVICE,
            compute_type="float32",
            asr_options={
                "suppress_numerals": True,
                "max_new_tokens": None,
                "clip_timestamps": None,
                "initial_prompt": prompt,
                "append_punctuations": ".。,,!!??::、",
                "hallucination_silence_threshold": None,
                "multilingual": True,
                "hotwords": None
            },
            vad_model_fp=str(vad_fp) if vad_fp is not None else None,
        )

    def transcribe(self, audio_info, lang=None):
        """Transcribe a Gradio ``(sr, ndarray)`` pair and forward to the aligner.

        Args:
            audio_info: (sample_rate, waveform) tuple.
            lang: optional language code; auto-detected when None.

        Returns:
            The aligner's result dict (see ``MMSAlignModel.align``).
        """
        audio = load_wav(audio_info).numpy()
        if lang is None:
            lang = self.model.detect_language(audio)
        # BUGFIX: namedtuple._replace returns a NEW object; the original code
        # discarded it, so the Simplified-Chinese prompt was never applied.
        if lang == 'zh':
            self.model.options = self.model.options._replace(initial_prompt="简体中文:")
        else:
            self.model.options = self.model.options._replace(initial_prompt=None)
        segments = self.model.transcribe(audio, batch_size=8, language=lang)["segments"]
        transcript = " ".join([segment["text"] for segment in segments])

        # If whisper detected an unsupported language, re-detect from the text
        # and transcribe again with the corrected language.
        if lang not in {'es','pt','zh','en','de','fr','it', 'ar', 'ru', 'ja', 'ko', 'hi', 'th', 'id', 'vi'}:
            lang = langid.classify(transcript)[0]
            segments = self.model.transcribe(audio, batch_size=8, language=lang)["segments"]
            transcript = " ".join([segment["text"] for segment in segments])
        logging.debug(f"whisperx: {segments}")

        transcript = zhconv.convert(transcript, 'zh-hans')
        transcript = transcript.replace("-", " ")
        transcript = re.sub(_whitespace_re, " ", transcript)
        # BUGFIX: guard against an empty transcript before stripping the
        # single leading space (the original indexed transcript[0] directly,
        # raising IndexError on silent audio).
        if transcript.startswith(" "):
            transcript = transcript[1:]
        segments = {'lang':lang, 'text_raw':transcript}
        if lang == "zh":
            segments["text"] = text_norm.txt2pinyin(transcript)
        else:
            transcript = replace_numbers_with_words(transcript, lang=lang).split(' ')
            segments["text"] = (transcript, transcript)

        return align_model.align(segments, audio_info)
509
+
510
+
511
def load_wav(audio_info, sr=16000, channel=1):
    """
    Convert a Gradio-style ``(sample_rate, np.ndarray)`` pair to a torch tensor.

    Args:
        audio_info: tuple of (source sample rate, waveform). The waveform may
            be mono ``(t,)`` or stereo ``(t, 2)`` / ``(2, t)``.
        sr: target sample rate; resampled via torchaudio when it differs.
        channel: 1 -> mono output (channel mean), 2 -> stereo (duplicated).

    Returns:
        A squeezed float tensor, peak-normalized and clipped to +/-0.999.
    """
    raw_sr, audio = audio_info
    # (t, 2) -> (2, t): put the channel axis first.
    audio = audio.T if len(audio.shape) > 1 and audio.shape[1] == 2 else audio
    # BUGFIX: guard the peak normalization — dividing by a zero peak turned
    # all-silent input into NaNs.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    audio = torch.from_numpy(audio).squeeze().float()
    if channel == 1 and len(audio.shape) == 2:  # stereo to mono
        audio = audio.mean(dim=0, keepdim=True)
    elif channel == 2 and len(audio.shape) == 1:
        audio = torch.stack((audio, audio))  # mono to stereo
    if raw_sr != sr:
        audio = torchaudio.functional.resample(audio.squeeze(), raw_sr, sr)
    audio = torch.clip(audio, -0.999, 0.999).squeeze()
    return audio
524
+
525
+
526
def update_word_time(lst, cut_time, edit_start, edit_end):
    """Shift word timestamps (and the edit window) left by ``cut_time`` seconds.

    Mutates the dicts in *lst* in place; the shifted edit start is clamped to 0.
    Returns ``(lst, new_edit_start, new_edit_end)``.
    """
    for entry in lst:
        entry["start"] = round(entry["start"] - cut_time, ndigits=3)
        entry["end"] = round(entry["end"] - cut_time, ndigits=3)
    shifted_start = max(round(edit_start - cut_time, ndigits=3), 0)
    shifted_end = round(edit_end - cut_time, ndigits=3)
    return lst, shifted_start, shifted_end
533
+
534
+
535
+ # def update_word_time2(lst, cut_time, edit_start, edit_end):
536
+ # for i in range(len(lst)):
537
+ # lst[i]["start"] = round(lst[i]["start"] + cut_time, ndigits=3)
538
+ # return lst, edit_start, edit_end
539
+
540
+
541
def get_audio_slice(audio, words_info, start_time, end_time, max_len=10, sr=16000, code_sr=50):
    """Cut a context window of ~``max_len`` seconds around the edit region.

    Args:
        audio: waveform tensor (last dim = samples at ``sr``).
        words_info: word dicts with "start"/"end" times in seconds.
        start_time, end_time: edit region in seconds.
        max_len: total context window length in seconds.
        sr: audio sample rate; code_sr: codec frame rate (for chunk rounding).

    Returns:
        ``((head, middle, tail), (sub_list, new_start, new_end))`` where the
        middle slice covers the selected words and times are rebased to it.
    """
    audio_dur = audio.shape[-1] / sr
    sub_list = []
    # If less than max_len/2 remains after the edit region, keep everything
    # to the end and trim from the front.
    if audio_dur - end_time <= max_len/2:
        for word in reversed(words_info):
            if word['start'] > start_time or audio_dur - word['start'] < max_len:
                sub_list = [word] + sub_list

    # If the edit region starts within max_len/2 of the beginning, keep the
    # head entirely and trim from the back.
    elif start_time <=max_len/2:
        for word in words_info:
            if word['end'] < max(end_time, max_len):
                sub_list += [word]

    # Otherwise keep max_len/2 of context on each side of the edit region.
    else:
        for word in words_info:
            if word['start'] > start_time - max_len/2 and word['end'] < end_time + max_len/2:
                sub_list += [word]
    audio = audio.squeeze()

    start = int(sub_list[0]['start']*sr)
    end = int(sub_list[-1]['end']*sr)
    end -= (end-start) % int(sr/code_sr)  # round down to a whole codec chunk

    # Rebase word times (and the edit window) to the middle slice's origin.
    sub_list, start_time, end_time = update_word_time(sub_list, sub_list[0]['start'], start_time, end_time)
    audio = audio.squeeze()

    return (audio[:start], audio[start:end], audio[end:]), (sub_list, start_time, end_time)
573
+
574
+
575
def load_models(lemas_model_name, whisper_model_name, alignment_model_name, denoise_model_name):
    """(Re)build all global inference models used by the Gradio UI.

    Args:
        lemas_model_name: LEMAS-TTS variant dir name under CKPTS_ROOT.
        whisper_model_name: whisperx ASR model name.
        alignment_model_name: "MMS" or whisperx-based alignment.
        denoise_model_name: "UVR5" or "DeepFilterNet".

    Returns:
        A fresh ``gr.Accordion`` (used by the UI to refresh the panel).
    """
    global transcribe_model, align_model, denoise_model, text_norm, tts_edit_model
    # Free GPU memory from any previously-loaded models before reloading.
    torch.cuda.empty_cache()
    gc.collect()

    if denoise_model_name == "UVR5":
        denoise_model = UVR5(os.path.join(str(PRETRAINED_ROOT), "uvr5"))
    elif denoise_model_name == "DeepFilterNet":
        denoise_model = DeepFilterNet("./audio_preprocess/denoiser_model.onnx")

    if alignment_model_name == "MMS":
        align_model = MMSAlignModel()
    else:
        # NOTE(review): WhisperxAlignModel is not defined in this portion of
        # the file — presumably defined elsewhere in gradio_mix.py; verify.
        align_model = WhisperxAlignModel()

    text_norm = TextNorm()

    transcribe_model = WhisperxModel(whisper_model_name)

    # Load LEMAS-TTS editing model (selected multilingual variant)
    from pathlib import Path

    ckpt_dir = Path(CKPTS_ROOT) / lemas_model_name
    # Pick the lexicographically-last checkpoint (highest step number).
    ckpt_candidates = sorted(
        list(ckpt_dir.glob("*.safetensors")) + list(ckpt_dir.glob("*.pt"))
    )
    if not ckpt_candidates:
        raise gr.Error(f"No LEMAS-TTS ckpt found under {ckpt_dir}")
    ckpt_file = str(ckpt_candidates[-1])

    vocab_file = Path(PRETRAINED_ROOT) / "data" / lemas_model_name / "vocab.txt"
    if not vocab_file.is_file():
        raise gr.Error(f"Vocab file not found: {vocab_file}")

    # Prosody encoder is optional: only enabled when both files are present.
    prosody_cfg = Path(CKPTS_ROOT) / "prosody_encoder" / "pretssel_cfg.json"
    prosody_ckpt = Path(CKPTS_ROOT) / "prosody_encoder" / "prosody_encoder_UnitY2.pt"
    use_prosody = prosody_cfg.is_file() and prosody_ckpt.is_file()

    tts_edit_model = TTS(
        model=lemas_model_name,
        ckpt_file=ckpt_file,
        vocab_file=str(vocab_file),
        device=device,
        use_ema=True,
        frontend="phone",
        use_prosody_encoder=use_prosody,
        prosody_cfg_path=str(prosody_cfg) if use_prosody else "",
        prosody_ckpt_path=str(prosody_ckpt) if use_prosody else "",
    )
    logging.info(f"Loaded LEMAS-TTS edit model from {ckpt_file}")

    return gr.Accordion()
634
+
635
+
636
def get_transcribe_state(segments):
    """Derive the UI transcript state dict from aligned segments.

    `segments` must carry 'text_raw' and a word list under 'words', each word a
    dict with 'word', 'start' and 'end' keys.
    """
    logging.info("===========After Align===========")
    logging.info(segments)
    words = segments["words"]
    with_starts = [f"{w['start']} {w['word']}" for w in words]
    with_ends = [f"{w['word']} {w['end']}" for w in words]
    bounds = [f"{w['start']} {w['word']} {w['end']}" for w in words]
    return {
        "segments": segments,
        "transcript": segments["text_raw"],
        "words_info": words,
        "transcript_with_start_time": " ".join(with_starts),
        "transcript_with_end_time": " ".join(with_ends),
        "word_bounds": bounds,
    }
647
+
648
+
649
def transcribe(seed, audio_info):
    """Run Whisper transcription and refresh the transcript boxes and word dropdowns."""
    if transcribe_model is None:
        raise gr.Error("Transcription model not loaded")
    seed_everything(seed)

    state = get_transcribe_state(transcribe_model.transcribe(audio_info))
    bounds = state["word_bounds"]
    from_word = gr.Dropdown(value=bounds[0], choices=bounds, interactive=True)
    to_word = gr.Dropdown(value=bounds[-1], choices=bounds, interactive=True)

    return [
        state["transcript"],
        state["transcript_with_start_time"],
        state["transcript_with_end_time"],
        # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
        from_word,  # edit_from_word
        to_word,    # edit_to_word
        state,
    ]
664
+
665
def align(transcript, audio_info, state):
    """Re-run forced alignment on a (possibly hand-corrected) transcript.

    Args:
        transcript: transcript text edited by the user in the UI.
        audio_info: Gradio audio value, forwarded to the alignment model.
        state: previous transcribe state; only the detected language is reused.

    Returns:
        The same 6-element update list as transcribe(): three transcript
        textboxes, the two edit-word dropdowns, and the refreshed state dict.
    """
    lang = state["segments"]["lang"]
    # print("realign: ", transcript, state)
    transcript = re.sub(_whitespace_re, " ", transcript)
    # Fix: the old `transcript[0] == " "` check raised IndexError when the
    # textbox was empty; startswith() is safe on "".
    if transcript.startswith(" "):
        transcript = transcript[1:]
    segments = {'lang': lang, 'text': transcript, 'text_raw': transcript}
    if lang == "zh":
        # Chinese goes through the pinyin front-end before alignment.
        segments["text"] = text_norm.txt2pinyin(transcript)
    else:
        # Other languages: spell out digits, then align whitespace tokens.
        transcript = replace_numbers_with_words(transcript)
        segments["text"] = (transcript.split(' '), transcript.split(' '))
    # print("text:", segments["text"])
    segments = align_model.align(segments, audio_info)

    state = get_transcribe_state(segments)

    return [
        state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
        # gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True), # prompt_to_word
        gr.Dropdown(value=state["word_bounds"][0], choices=state["word_bounds"], interactive=True),  # edit_from_word
        gr.Dropdown(value=state["word_bounds"][-1], choices=state["word_bounds"], interactive=True),  # edit_to_word
        state
    ]
688
+
689
+
690
def denoise(audio_info):
    """Denoise the input audio with the loaded model; returns (sr, samples)."""
    samples, sample_rate = denoise_model.denoise(audio_info)
    return (sample_rate, samples)
694
+
695
def cancel_denoise(audio_info):
    """Put the original (un-denoised) audio back into the denoise preview slot."""
    return audio_info
697
+
698
def get_output_audio(audio_tensors, sr):
    """Concatenate per-sentence tensors into one int16 clip for gr.Audio."""
    joined = torch.cat(audio_tensors, -1).squeeze().cpu().numpy()
    pcm16 = (joined * np.iinfo(np.int16).max).astype(np.int16)
    print("save result:", pcm16.shape)
    # wavfile.write(os.path.join(TMP_PATH, "output.wav"), sr, result)
    return (int(sr), pcm16)
705
+
706
+
707
def get_edit_audio_part(audio_info, edit_start, edit_end):
    """Cut the [edit_start, edit_end] span (in seconds) out of an (sr, samples) pair."""
    sr, samples = audio_info
    lo = int(edit_start * sr)
    hi = int(edit_end * sr)
    return (sr, samples[lo:hi])
711
+
712
+
713
def crossfade_concat(chunk1, chunk2, overlap):
    """Join two chunks with an equal-power (cos^2) crossfade over `overlap` samples.

    Note: the first `overlap` samples of `chunk2` are overwritten in place with
    the blended values before concatenation.
    """
    # Complementary cos^2 ramps: fade_out goes 1 -> 0, fade_in goes 0 -> 1.
    fade_out = torch.cos(torch.linspace(0, torch.pi / 2, overlap)) ** 2
    fade_in = torch.cos(torch.linspace(torch.pi / 2, 0, overlap)) ** 2
    tail = chunk1[-overlap:]
    chunk2[:overlap] = tail * fade_out + chunk2[:overlap] * fade_in
    return torch.cat((chunk1[:-overlap], chunk2), dim=0)
720
+
721
def replace_numbers_with_words(sentence, lang="en"):
    """Spell out integer digit runs in `sentence` using num2words.

    Digit runs are first isolated with surrounding spaces, then each run is
    converted to words; on any conversion failure the digits are kept as-is.

    Args:
        sentence: input text.
        lang: num2words language code (default "en").

    Returns:
        The sentence with digit runs replaced by their spelled-out form.
    """
    sentence = re.sub(r'(\d+)', r' \1 ', sentence)  # add spaces around numbers

    def replace_with_words(match):
        num = match.group(0)
        try:
            return num2words(num, lang=lang)  # Convert numbers to words
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; Exception still covers num2words failures.
        except Exception:
            return num  # In case num2words fails (unlikely with digits but just to be safe)

    return re.sub(r'\b\d+\b', replace_with_words, sentence)
730
+
731
+
732
def run(seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
        audio_info, denoised_audio, transcribe_state, transcript, smart_transcript,
        mode, start_time, end_time,
        split_text, selected_sentence, audio_tensors):
    """Perform a LEMAS-TTS speech edit on the selected time span.

    The words between `start_time` and `end_time` are replaced by `transcript`,
    the full target text is rebuilt from the alignment, and the model
    regenerates the edited region.

    Args:
        seed: RNG seed; -1 means random.
        nfe_step, cfg_strength, sway_sampling_coef, ref_ratio: sampling knobs
            forwarded to gen_wav_multilingual().
        speed, mode, split_text, selected_sentence, audio_tensors: accepted for
            UI wiring compatibility; not used by the edit path below.
        audio_info / denoised_audio: (sr, samples) pairs; the denoised version
            is preferred when its duration matches the original.
        transcribe_state: state dict produced by transcribe()/align().
        transcript: replacement text for the selected span.
        smart_transcript: requires a whisper transcript to be present.
        start_time, end_time: edit span in seconds.

    Returns:
        (output_audio, target_text, sentence_dropdown_update, audio_tensors).

    Raises:
        gr.Error: on missing models/alignment or an invalid time span.
    """
    if tts_edit_model is None:
        raise gr.Error("LEMAS-TTS edit model not loaded")
    if smart_transcript and (transcribe_state is None):
        raise gr.Error("Can't use smart transcript: whisper transcript not found")

    # if mode == "Rerun":
    #     colon_position = selected_sentence.find(':')
    #     selected_sentence_idx = int(selected_sentence[:colon_position])
    #     sentences = [selected_sentence[colon_position + 1:]]

    # Choose base audio (denoised if duration matches)
    audio_base = audio_info
    audio_dur = round(audio_info[1].shape[0] / audio_info[0], ndigits=3)
    if denoised_audio is not None:
        denoised_dur = round(denoised_audio[1].shape[0] / denoised_audio[0], ndigits=3)
        # Accept the denoised track when durations match exactly, or nearly
        # match at a different sample rate (resampling rounding tolerance).
        if audio_dur == denoised_dur or (
            denoised_audio[0] != audio_info[0] and abs(audio_dur - denoised_dur) < 0.1
        ):
            audio_base = denoised_audio
            logging.info("use denoised audio")

    raw_sr, raw_wav = audio_base
    print("audio_dur: ", audio_dur, raw_sr, raw_wav.shape, start_time, end_time)

    # Build target text by replacing the selected span with `transcript`
    words = transcribe_state["words_info"]
    if not words:
        raise gr.Error("No word-level alignment found; please run Transcribe first.")

    start_time = float(start_time)
    end_time = float(end_time)
    if end_time <= start_time:
        raise gr.Error("Edit end time must be greater than start time.")

    # Find word indices covering the selected region
    # start_idx: first word that ends after start_time.
    start_idx = 0
    for i, w in enumerate(words):
        if w["end"] > start_time:
            start_idx = i
            break

    # end_idx: one past the last word that starts before end_time.
    end_idx = len(words)
    for i in range(len(words) - 1, -1, -1):
        if words[i]["start"] < end_time:
            end_idx = i + 1
            break
    # Guarantee a non-empty span of at least one word.
    if end_idx <= start_idx:
        end_idx = min(start_idx + 1, len(words))

    word_start_sec = float(words[start_idx]["start"])
    word_end_sec = float(words[end_idx - 1]["end"])

    # Edit span in seconds (relative to full utterance), padded by 100 ms.
    edit_start = max(0.0, word_start_sec - 0.1)
    edit_end = min(word_end_sec + 0.1, audio_dur)
    parts_to_edit = [(edit_start, edit_end)]

    # Rebuild target text: words before the span + replacement + words after.
    # NOTE(review): assumes display_text splits into the same number of tokens
    # as the alignment's word list — confirm for zh transcripts.
    display_text = transcribe_state["segments"]["text_raw"].strip()
    txt_list = display_text.split(" ") if display_text else [w["word"] for w in words]

    prefix = " ".join(txt_list[:start_idx]).strip()
    suffix = " ".join(txt_list[end_idx:]).strip()
    new_phrase = transcript.strip()

    pieces = []
    if prefix:
        pieces.append(prefix)
    if new_phrase:
        pieces.append(new_phrase)
    if suffix:
        pieces.append(suffix)
    target_text = " ".join(pieces)

    logging.info(
        "target_text: %s (start_idx=%d, end_idx=%d, parts_to_edit=%s)",
        target_text,
        start_idx,
        end_idx,
        parts_to_edit,
    )

    # Prepare audio for LEMAS-TTS editing (mono, target SR)
    segment_audio = load_wav(audio_base, sr=tts_edit_model.target_sample_rate)

    seed_val = None if seed == -1 else int(seed)

    wav_out, _ = gen_wav_multilingual(
        tts_edit_model,
        segment_audio,
        tts_edit_model.target_sample_rate,
        target_text,
        parts_to_edit,
        nfe_step=int(nfe_step),
        cfg_strength=float(cfg_strength),
        sway_sampling_coef=float(sway_sampling_coef),
        ref_ratio=float(ref_ratio),
        no_ref_audio=False,
        use_acc_grl=False,
        use_prosody_encoder_flag=True,
        seed=seed_val,
    )

    # Convert the generated float waveform to int16 for the gr.Audio output.
    wav_np = wav_out.cpu().numpy()
    wav_np = np.clip(wav_np, -0.999, 0.999)
    wav_int16 = (wav_np * np.iinfo(np.int16).max).astype(np.int16)
    out_sr = int(tts_edit_model.target_sample_rate)

    output_audio = (out_sr, wav_int16)
    sentences = [f"0: {target_text}"]
    audio_tensors = [torch.from_numpy(wav_np)]

    component = gr.Dropdown(choices=sentences, value=sentences[0])
    return output_audio, target_text, component, audio_tensors
849
+
850
+
851
def update_input_audio(audio_info):
    """Resize the edit-time sliders to match the uploaded audio's duration.

    Args:
        audio_info: None, a file path, or an (sr, samples) tuple from gr.Audio.

    Returns:
        Updates for [edit_start_time, edit_end_time]: start reset to 0, end set
        to the clip duration.

    Raises:
        gr.Error: if the audio value has an unsupported type.
    """
    if audio_info is None:
        # Fix: was `return 0, 0, 0` — three values for two Gradio outputs.
        return [gr.Slider(value=0), gr.Slider(value=0)]
    if isinstance(audio_info, str):
        # Fix: was `torchaudio.info(audio_path)` — `audio_path` was undefined
        # (NameError); the path lives in `audio_info`.
        info = torchaudio.info(audio_info)
        max_time = round(info.num_frames / info.sample_rate, 2)
    elif isinstance(audio_info, tuple):
        max_time = round(audio_info[1].shape[0] / audio_info[0], 2)
    else:
        # Previously fell through with `max_time` unbound (NameError).
        raise gr.Error(f"Unsupported audio input type: {type(audio_info)}")
    return [
        # gr.Slider(maximum=max_time, value=max_time),
        gr.Slider(maximum=max_time, value=0),
        gr.Slider(maximum=max_time, value=max_time),
    ]
864
+
865
+
866
def change_mode(mode):
    """Toggle visibility of the mode-specific control groups.

    Outputs map to: tts_mode_controls, edit_mode_controls, edit_word_mode,
    split_text, long_tts_sentence_editor.
    """
    is_edit = mode == "Edit"
    is_long_tts = mode == "Long TTS"
    return [
        gr.Group(visible=not is_edit),
        gr.Group(visible=is_edit),
        gr.Radio(visible=is_edit),
        gr.Radio(visible=is_long_tts),
        gr.Group(visible=is_long_tts),
    ]
875
+
876
+
877
def load_sentence(selected_sentence, audio_tensors):
    """Play back the audio for a dropdown entry formatted as 'idx: text'."""
    if selected_sentence is None:
        return None
    colon_position = selected_sentence.find(':')
    sentence_idx = int(selected_sentence[:colon_position])
    # Use LEMAS-TTS target sample rate if available, otherwise default to 16000
    sample_rate = getattr(tts_edit_model, "target_sample_rate", 16000)
    return get_output_audio([audio_tensors[sentence_idx]], sample_rate)
885
+
886
+
887
def update_bound_word(is_first_word, selected_word, edit_word_mode):
    """Map a '<start> <word> <end>' dropdown entry to an edit-boundary time.

    "Replace half" uses the word's midpoint; otherwise the first word maps to
    its start time and the last word to its end time.
    """
    if selected_word is None:
        return None

    tokens = selected_word.split(' ')
    word_start, word_end = float(tokens[0]), float(tokens[-1])

    if edit_word_mode == "Replace half":
        return (word_start + word_end) / 2
    return word_start if is_first_word else word_end
901
+
902
+
903
def update_bound_words(from_selected_word, to_selected_word, edit_word_mode):
    """Refresh both edit-boundary sliders from the two word dropdowns."""
    start_bound = update_bound_word(True, from_selected_word, edit_word_mode)
    end_bound = update_bound_word(False, to_selected_word, edit_word_mode)
    return [start_bound, end_bound]
908
+
909
+
910
# Help text rendered next to the "Smart transcript" checkbox.
smart_transcript_info = """
If enabled, the target transcript will be constructed for you:</br>
- In TTS and Long TTS mode just write the text you want to synthesize.</br>
- In Edit mode just write the text to replace selected editing segment.</br>
If disabled, you should write the target transcript yourself:</br>
- In TTS mode write prompt transcript followed by generation transcript.</br>
- In Long TTS select split by newline (<b>SENTENCE SPLIT WON'T WORK</b>) and start each line with a prompt transcript.</br>
- In Edit mode write full prompt</br>
"""

# Initial value of the "Original transcript" textbox.
demo_original_transcript = ""

# Demo target texts per mode, for both smart and regular transcript entry.
demo_text = {
    "TTS": {
        "smart": "take over the stage for half an hour,",
        "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour."
    },
    "Edit": {
        "smart": "Just write it line-by-line.",
        "regular": "照片、医疗记录、神经重塑的易损性,这是某种数据库啊!还有PRELESS的脑部扫描、生物管型、神经重塑."
    },
    "Long TTS": {
        "smart": "You can run the model on a big text!\n"
                 "Just write it line-by-line. Or sentence-by-sentence.\n"
                 "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
        "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
                   "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
                   "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
    }
}

# Flat set of every demo string above; update_demo() uses it to detect whether
# the transcript box still holds unedited demo text.
all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
942
+
943
+ demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414', '1.494 for 1.634', '1.695 his 1.835', '1.915 work 2.136', '2.196 and 2.297', '2.337 for 2.517', '2.557 his 2.678', '2.758 feats 3.019', '3.079 of 3.139', '3.2 strength, 3.561', '4.022 round 4.263', '4.303 his 4.444', '4.524 neck 4.705', '4.745 and 4.825', '4.905 over 5.086', '5.146 his 5.266', '5.307 shoulders, 5.768', '6.23 an 6.33', '6.531 esclavine 7.133', '7.213 of 7.293', '7.353 leather. 7.614']
944
+
945
+ demo_words_info = [{'word': 'Gwynplain', 'start': 0.069, 'end': 0.611, 'score': 0.833}, {'word': 'had,', 'start': 0.671, 'end': 0.912, 'score': 0.879}, {'word': 'besides,', 'start': 0.952, 'end': 1.414, 'score': 0.863}, {'word': 'for', 'start': 1.494, 'end': 1.634, 'score': 0.89}, {'word': 'his', 'start': 1.695, 'end': 1.835, 'score': 0.669}, {'word': 'work', 'start': 1.915, 'end': 2.136, 'score': 0.916}, {'word': 'and', 'start': 2.196, 'end': 2.297, 'score': 0.766}, {'word': 'for', 'start': 2.337, 'end': 2.517, 'score': 0.808}, {'word': 'his', 'start': 2.557, 'end': 2.678, 'score': 0.786}, {'word': 'feats', 'start': 2.758, 'end': 3.019, 'score': 0.97}, {'word': 'of', 'start': 3.079, 'end': 3.139, 'score': 0.752}, {'word': 'strength,', 'start': 3.2, 'end': 3.561, 'score': 0.742}, {'word': 'round', 'start': 4.022, 'end': 4.263, 'score': 0.916}, {'word': 'his', 'start': 4.303, 'end': 4.444, 'score': 0.666}, {'word': 'neck', 'start': 4.524, 'end': 4.705, 'score': 0.908}, {'word': 'and', 'start': 4.745, 'end': 4.825, 'score': 0.882}, {'word': 'over', 'start': 4.905, 'end': 5.086, 'score': 0.847}, {'word': 'his', 'start': 5.146, 'end': 5.266, 'score': 0.791}, {'word': 'shoulders,', 'start': 5.307, 'end': 5.768, 'score': 0.729}, {'word': 'an', 'start': 6.23, 'end': 6.33, 'score': 0.854}, {'word': 'esclavine', 'start': 6.531, 'end': 7.133, 'score': 0.803}, {'word': 'of', 'start': 7.213, 'end': 7.293, 'score': 0.772}, {'word': 'leather.', 'start': 7.353, 'end': 7.614, 'score': 0.896}]
946
+
947
+
948
def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
    """Swap in demo transcript/word selections when the user hasn't edited them."""
    if transcript not in all_demo_texts:
        # User-provided text: leave everything untouched.
        return transcript, edit_from_word, edit_to_word

    replace_half = edit_word_mode == "Replace half"
    from_default = demo_words[2] if replace_half else demo_words[3]
    to_default = demo_words[12] if replace_half else demo_words[11]
    # Only swap a dropdown if it still holds one of the demo defaults.
    swap_from = edit_from_word in (demo_words[2], demo_words[3])
    swap_to = edit_to_word in (demo_words[11], demo_words[12])
    new_transcript = demo_text[mode]["smart" if smart_transcript else "regular"]
    return [
        new_transcript,
        from_default if swap_from else edit_from_word,
        to_default if swap_to else edit_to_word,
    ]
962
+
963
def get_app():
    """Build and return the Gradio Blocks UI for LEMAS-TTS speech editing.

    Layout: model loading row on top; then input/transcription column,
    edit-controls column, and output column; generation parameters at the
    bottom. All event handlers are wired to the module-level functions above.
    """
    with gr.Blocks() as app:
        # --- Model selection / loading row -------------------------------
        with gr.Row():
            with gr.Column(scale=2):
                load_models_btn = gr.Button(value="Load models")
            with gr.Column(scale=5):
                with gr.Accordion("Select models", open=False) as models_selector:
                    # For LEMAS-TTS editing, we expose a simple model selector
                    # between the two multilingual variants.
                    lemas_model_choice = gr.Radio(
                        label="LEMAS-TTS Model",
                        choices=["multilingual_grl", "multilingual_prosody"],
                        value="multilingual_grl",
                        interactive=True,
                    )
                    with gr.Row():
                        denoise_model_choice = gr.Radio(label="Denoise Model", scale=2, value="UVR5", choices=["UVR5", "DeepFilterNet"]) # "830M", "330M_TTSEnhanced", "830M_TTSEnhanced"])
                        # whisper_backend_choice = gr.Radio(label="Whisper backend", value="", choices=["whisperX", "whisper"])
                        whisper_model_choice = gr.Radio(label="Whisper model", scale=3, value="medium", choices=["base", "small", "medium", "large"])
                        align_model_choice = gr.Radio(label="Forced alignment model", scale=2, value="MMS", choices=["whisperX", "MMS"], visible=False)
                        # audiosr_choice = gr.Radio(label="AudioSR model", scale=2, value="None", choices=["basic", "speech", "None"])

        # --- Main editing area -------------------------------------------
        with gr.Row():
            # Left column: input audio, transcription and denoising.
            with gr.Column(scale=2):
                input_audio = gr.Audio(value=f"{DEMO_PATH}/V-00013_en-US.wav", label="Input Audio", interactive=True)

                with gr.Row():
                    transcribe_btn = gr.Button(value="Transcribe")
                    align_btn = gr.Button(value="ReAlign")
                with gr.Group():
                    original_transcript = gr.Textbox(label="Original transcript", lines=5, interactive=True, value=demo_original_transcript,
                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
                    with gr.Accordion("Word start time", open=False, visible=False):
                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
                    with gr.Accordion("Word end time", open=False, visible=False):
                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")

                with gr.Row():
                    denoise_btn = gr.Button(value="Denoise")
                    cancel_btn = gr.Button(value="Cancel Denoise")
                denoise_audio = gr.Audio(label="Denoised Audio", value=None, interactive=False)

            # Middle column: target text and edit-span selection.
            with gr.Column(scale=3):
                with gr.Group():
                    transcript_inbox = gr.Textbox(label="Text", lines=5, value=demo_text["Edit"]["smart"])
                    with gr.Row(visible=False):
                        smart_transcript = gr.Checkbox(label="Smart transcript", value=True)
                        with gr.Accordion(label="?", open=False):
                            info = gr.Markdown(value=smart_transcript_info)

                    # Only "Edit" mode is exposed; the radio is kept hidden for
                    # handler-signature compatibility.
                    mode = gr.Radio(label="Mode", choices=["Edit"], value="Edit", visible=False)
                    with gr.Row(visible=False):
                        split_text = gr.Radio(label="Split text", choices=["Newline", "Sentence"], value="Newline",
                                              info="Split text into parts and run TTS for each part.", visible=True)
                        edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace all",
                                                  info="What to do with first and last word", visible=False)

                # with gr.Group(visible=False) as tts_mode_controls:
                #     with gr.Row():
                #         edit_from_word = gr.Dropdown(label="First word in prompt", choices=demo_words, value=demo_words[12], interactive=True)
                #         edit_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[18], interactive=True)
                #     with gr.Row():
                #         edit_start_time = gr.Slider(label="Prompt start time", minimum=0, maximum=7.614, step=0.001, value=4.022)
                #         edit_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.614, step=0.001, value=5.768)
                #     with gr.Row():
                #         check_btn = gr.Button(value="Check prompt",scale=1)
                #         edit_audio = gr.Audio(label="Prompt Audio", scale=3)

                # with gr.Group() as edit_mode_controls:
                with gr.Row():
                    edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[12], interactive=True)
                    edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[18], interactive=True)
                with gr.Row():
                    edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.614, step=0.001, value=4.022)
                    edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.614, step=0.001, value=5.768)
                with gr.Row():
                    check_btn = gr.Button(value="Check edit words",scale=1)
                    edit_audio = gr.Audio(label="Edit word(s)", scale=3)

                run_btn = gr.Button(value="Run", variant="primary")

            # Right column: generated output and sentence re-generation.
            with gr.Column(scale=2):
                output_audio = gr.Audio(label="Output Audio")
                with gr.Accordion("Inference transcript", open=True):
                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False, info="Inference was performed on this transcript.")
                with gr.Group(visible=False) as long_tts_sentence_editor:
                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
                                                    info="Select sentence you want to regenerate")
                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
                    rerun_btn = gr.Button(value="Rerun")

        # --- Generation parameters ---------------------------------------
        with gr.Row():
            with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
                with gr.Row():
                    nfe_step = gr.Number(
                        label="NFE Step",
                        value=64,
                        precision=0,
                        info="Number of function evaluations (sampling steps).",
                    )
                    speed = gr.Slider(
                        label="Speed",
                        minimum=0.5,
                        maximum=1.5,
                        step=0.05,
                        value=1.0,
                        info="Placeholder for future use; currently not applied.",
                    )
                    cfg_strength = gr.Slider(
                        label="CFG Strength",
                        minimum=0.0,
                        maximum=10.0,
                        step=0.5,
                        value=5.0,
                        info="Classifier-free guidance strength.",
                    )

                with gr.Row():
                    sway_sampling_coef = gr.Slider(
                        label="Sway",
                        minimum=2.0,
                        maximum=5.0,
                        step=0.1,
                        value=3.0,
                        info="Sampling sway coefficient.",
                    )
                    ref_ratio = gr.Slider(
                        label="Ref Ratio",
                        minimum=0.0,
                        maximum=1.0,
                        step=0.05,
                        value=1.0,
                        info="How much to rely on reference audio (if used).",
                    )
                    seed = gr.Number(
                        label="Seed",
                        value=-1,
                        precision=0,
                        info="-1 for random, otherwise fixed seed.",
                    )

        # Hidden state: generated per-sentence tensors and the last alignment.
        audio_tensors = gr.State()
        transcribe_state = gr.State(value={"words_info": demo_words_info, "lang":"zh"})

        # --- Event wiring -------------------------------------------------
        edit_word_mode.change(fn=update_demo,
                              inputs=[mode, smart_transcript, edit_word_mode, transcript_inbox, edit_from_word, edit_to_word],
                              outputs=[transcript_inbox, edit_from_word, edit_to_word])
        smart_transcript.change(
            fn=update_demo,
            inputs=[mode, smart_transcript, edit_word_mode, transcript_inbox, edit_from_word, edit_to_word],
            outputs=[transcript_inbox, edit_from_word, edit_to_word],
        )

        load_models_btn.click(fn=load_models,
                              inputs=[lemas_model_choice, whisper_model_choice, align_model_choice, denoise_model_choice], # audiosr_choice],
                              outputs=[models_selector])

        input_audio.upload(fn=update_input_audio,
                           inputs=[input_audio],
                           outputs=[edit_start_time, edit_end_time]) # prompt_end_time

        transcribe_btn.click(fn=transcribe,
                             inputs=[seed, input_audio],
                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time,
                                      edit_from_word, edit_to_word, transcribe_state]) # prompt_to_word
        align_btn.click(fn=align,
                        inputs=[original_transcript, input_audio, transcribe_state],
                        outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time,
                                 edit_from_word, edit_to_word, transcribe_state]) # prompt_to_word

        denoise_btn.click(fn=denoise,
                          inputs=[input_audio],
                          outputs=[denoise_audio])

        cancel_btn.click(fn=cancel_denoise,
                         inputs=[input_audio],
                         outputs=[denoise_audio])

        # mode.change(fn=change_mode,
        #             inputs=[mode],
        #             outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])

        check_btn.click(fn=get_edit_audio_part,
                        inputs=[input_audio, edit_start_time, edit_end_time],
                        outputs=[edit_audio])

        run_btn.click(fn=run,
                      inputs=[
                          seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
                          input_audio, denoise_audio, transcribe_state, transcript_inbox, smart_transcript,
                          mode, edit_start_time, edit_end_time,
                          split_text, sentence_selector, audio_tensors
                      ],
                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])

        sentence_selector.change(
            fn=load_sentence,
            inputs=[sentence_selector, audio_tensors],
            outputs=[sentence_audio],
        )
        rerun_btn.click(fn=run,
                        inputs=[
                            seed, nfe_step, speed, cfg_strength, sway_sampling_coef, ref_ratio,
                            input_audio, denoise_audio, transcribe_state, transcript_inbox, smart_transcript,
                            gr.State(value="Rerun"), edit_start_time, edit_end_time,
                            split_text, sentence_selector, audio_tensors
                        ],
                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])

        # prompt_to_word.change(fn=update_bound_word,
        #                       inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
        #                       outputs=[prompt_end_time])
        edit_from_word.change(fn=update_bound_word,
                              inputs=[gr.State(True), edit_from_word, edit_word_mode],
                              outputs=[edit_start_time])
        edit_to_word.change(fn=update_bound_word,
                            inputs=[gr.State(False), edit_to_word, edit_word_mode],
                            outputs=[edit_end_time])
        edit_word_mode.change(fn=update_bound_words,
                              inputs=[edit_from_word, edit_to_word, edit_word_mode],
                              outputs=[edit_start_time, edit_end_time])

    return app
1188
+
1189
+
1190
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")

    parser.add_argument("--demo-path", default="./demo", help="Path to demo directory")
    parser.add_argument("--tmp-path", default="/cto_labs/vistring/zhaozhiyuan/outputs/voicecraft/tmp", help="Path to tmp directory")
    parser.add_argument("--models-path", default="/cto_labs/vistring/zhaozhiyuan/outputs/voicecraft/pretrain/VoiceCraft", help="Path to voicecraft models directory")
    parser.add_argument("--port", default=41020, type=int, help="App port")
    parser.add_argument("--share", action="store_true", help="Launch with public url")
    parser.add_argument("--server_name", default="0.0.0.0", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")

    # Some downstream libs expect $USER to exist (e.g. in containers it may not).
    os.environ["USER"] = os.getenv("USER", "user")
    args = parser.parse_args()
    # Module-level path globals read by get_app() and the handlers above.
    DEMO_PATH = args.demo_path
    TMP_PATH = args.tmp_path
    MODELS_PATH = args.models_path

    app = get_app()
    app.queue().launch(share=args.share, server_name=args.server_name, server_port=args.port)
inference_gradio.py ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+ import platform
4
+ import psutil
5
+ import tempfile
6
+ from glob import glob
7
+ import traceback
8
+ import click
9
+ import gradio as gr
10
+ import torch
11
+ import torchaudio
12
+ import soundfile as sf
13
+ from pathlib import Path
14
+
15
+ from cached_path import cached_path
16
+
17
+ from lemas_tts.api import TTS, PRETRAINED_ROOT, CKPTS_ROOT
18
+
19
# Global variables holding the lazily (re)loaded TTS API and its load keys.
tts_api = None
last_checkpoint = ""
last_device = ""
last_ema = None

# Device detection: prefer CUDA, then Intel XPU, then Apple MPS, else CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "xpu"
    if torch.xpu.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

REPO_ROOT = Path(__file__).resolve().parent

# HF location for large TTS checkpoints (too big for Space storage)
HF_PRETRAINED_ROOT = "hf://LEMAS-Project/LEMAS-TTS/pretrained_models"

# Point phonemizer at the espeak-ng-data shipped inside `pretrained_models`
# (the bundled dictionaries). The shared library itself comes from the
# system-installed espeak-ng (via apt); PHONEMIZER_ESPEAK_LIBRARY is left
# unset on purpose so a locally copied .so cannot clash with the Space's
# base image.
ESPEAK_DATA_DIR = Path(PRETRAINED_ROOT) / "espeak-ng-data"
os.environ["ESPEAK_DATA_PATH"] = str(ESPEAK_DATA_DIR)
os.environ["ESPEAKNG_DATA_PATH"] = str(ESPEAK_DATA_DIR)
47
+
48
+
49
class UVR5:
    """Small wrapper around the bundled uvr5 implementation for denoising."""

    def __init__(self, model_dir: Path, code_dir: Path):
        # model_dir: directory holding the ONNX weights and JSON config;
        # code_dir: directory of the bundled uvr5 sources (added to sys.path).
        self.model = self.load_model(str(model_dir), str(code_dir))

    def load_model(self, model_dir: str, code_dir: str):
        """Build the MDX-Net Inference object for the Kim_Vocal_1 model on CPU."""
        import sys
        import json

        # Make the bundled uvr5 package importable.
        if code_dir not in sys.path:
            sys.path.append(code_dir)

        from multiprocess_cuda_infer import ModelData, Inference

        model_path = os.path.join(model_dir, "Kim_Vocal_1.onnx")
        config_path = os.path.join(model_dir, "MDX-Net-Kim-Vocal1.json")
        with open(config_path, "r", encoding="utf-8") as f:
            configs = json.load(f)
        model_data = ModelData(
            model_path=model_path,
            audio_path=model_dir,
            result_path=model_dir,
            device="cpu",
            process_method="MDX-Net",
            base_dir=model_dir,  # keep base_dir and model_dir the same (paths under `pretrained_models`)
            **configs,
        )

        uvr5_model = Inference(model_data, "cpu")
        uvr5_model.load_model(model_path, 1)
        return uvr5_model

    def denoise(self, audio_info):
        """Separate vocals from `audio_info`; returns (samples, 44100).

        NOTE(review): `demix_base` is assumed to return a torch tensor here
        (`.numpy()` is called on it) — confirm against the uvr5 sources.
        """
        print("denoise UVR5: ", audio_info)
        # MDX-Net expects 44.1 kHz stereo input.
        input_audio = load_wav(audio_info, sr=44100, channel=2)
        output_audio = self.model.demix_base({0: input_audio.squeeze()}, is_match_mix=False)
        return output_audio.squeeze().T.numpy(), 44100
87
+
88
# Instantiate the denoiser once at import time (weights under
# `pretrained_models/uvr5`, code under the repo's `uvr5/` folder).
denoise_model = UVR5(
    model_dir=str(Path(PRETRAINED_ROOT) / "uvr5"),
    code_dir=str(REPO_ROOT / "uvr5"),
)
92
+
93
def load_wav(audio_info, sr=16000, channel=1):
    """Load an audio file as a peak-normalized float tensor.

    Args:
        audio_info: path to the audio file (anything torchaudio can read).
        sr: target sample rate; audio is resampled when it differs.
        channel: 1 for mono output, 2 for stereo output.

    Returns:
        A float tensor clipped to [-0.999, 0.999] at sample rate `sr`.
    """
    print("load audio:", audio_info)
    audio, raw_sr = torchaudio.load(audio_info)
    # torchaudio returns (channels, time); flip data that arrived as
    # (time, 2) back to channel-first layout.
    audio = audio.T if len(audio.shape) > 1 and audio.shape[1] == 2 else audio
    # Peak-normalize. Clamp the peak away from zero so a silent file does
    # not produce NaNs via 0/0 (previously an unguarded division).
    peak = torch.max(torch.abs(audio))
    audio = audio / torch.clamp(peak, min=1e-8)
    audio = audio.squeeze().float()
    if channel == 1 and len(audio.shape) == 2:  # stereo to mono
        audio = audio.mean(dim=0, keepdim=True)
    elif channel == 2 and len(audio.shape) == 1:
        audio = torch.stack((audio, audio))  # mono to stereo
    if raw_sr != sr:
        audio = torchaudio.functional.resample(audio.squeeze(), raw_sr, sr)
    audio = torch.clip(audio, -0.999, 0.999).squeeze()
    return audio
107
+
108
+
109
def denoise(audio_info):
    """Run UVR5 vocal isolation on `audio_info` and return the output path.

    Writes to a unique temporary file instead of the previous fixed
    "./denoised_audio.wav", so concurrent Gradio requests cannot overwrite
    each other's results.
    """
    import tempfile

    denoised_audio, sr = denoise_model.denoise(audio_info)
    with tempfile.NamedTemporaryFile(delete=False, prefix="denoised_", suffix=".wav") as f:
        save_path = f.name
    sf.write(save_path, denoised_audio, sr, format='wav', subtype='PCM_24')
    print("save denoised audio:", save_path)
    return save_path
115
+
116
def cancel_denoise(audio_info):
    """Cancel a previous denoise: echo the untouched reference audio back."""
    original = audio_info
    return original
118
+
119
+
120
def get_checkpoints_project(project_name=None, is_gradio=True):
    """List available checkpoint files for the dropdown.

    Searches *.pt / *.safetensors under the local `CKPTS_ROOT`; when nothing
    is found locally, falls back to known checkpoints on Hugging Face
    (hf:// URLs that `infer()` later resolves via `cached_path`).

    Args:
        project_name: restrict search to `CKPTS_ROOT/<project_name>`;
            None scans the whole checkpoint tree.
        is_gradio: when True return a `gr.update` for the dropdown,
            otherwise return `(files_checkpoints, select_checkpoint)`.
    """
    checkpoint_dir = [str(CKPTS_ROOT)]
    # Remote ckpt locations on HF (used when local ckpts are not present)
    remote_ckpts = {
        "multilingual_grl": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_grl/multilingual_grl.safetensors",
        "multilingual_prosody": f"{HF_PRETRAINED_ROOT}/ckpts/multilingual_prosody/multilingual_prosody.safetensors",
    }

    if project_name is None:
        # Look for checkpoints anywhere under the local checkpoint directory.
        files_checkpoints = []
        for path in checkpoint_dir:
            if os.path.isdir(path):
                files_checkpoints.extend(glob(os.path.join(path, "**/*.pt"), recursive=True))
                files_checkpoints.extend(glob(os.path.join(path, "**/*.safetensors"), recursive=True))
                break
        # Fallback to remote ckpts if none found locally
        if not files_checkpoints:
            files_checkpoints = list(remote_ckpts.values())
    else:
        files_checkpoints = []
        if os.path.isdir(checkpoint_dir[0]):
            files_checkpoints = glob(os.path.join(checkpoint_dir[0], project_name, "*.pt"))
            files_checkpoints.extend(glob(os.path.join(checkpoint_dir[0], project_name, "*.safetensors")))
        # If no local ckpts for this project, try remote mapping
        if not files_checkpoints:
            ckpt = remote_ckpts.get(project_name)
            files_checkpoints = [ckpt] if ckpt is not None else []
    print("files_checkpoints:", project_name, files_checkpoints)
    # Separate pretrained and regular checkpoints
    pretrained_checkpoints = [f for f in files_checkpoints if "pretrained_" in os.path.basename(f)]
    regular_checkpoints = [
        f
        for f in files_checkpoints
        if "pretrained_" not in os.path.basename(f) and "model_last.pt" not in os.path.basename(f)
    ]
    # Renamed from `last_checkpoint`: the old name shadowed the module-level
    # cache variable of the same name that `infer()` mutates via `global`.
    last_checkpoint_files = [f for f in files_checkpoints if "model_last.pt" in os.path.basename(f)]

    # Sort regular checkpoints by update number (model_<N>.pt); fall back to
    # lexicographic order when a name doesn't match that pattern.
    try:
        regular_checkpoints = sorted(
            regular_checkpoints, key=lambda x: int(os.path.basename(x).split("_")[1].split(".")[0])
        )
    except (IndexError, ValueError):
        regular_checkpoints = sorted(regular_checkpoints)

    # Combine in order: pretrained, regular, last
    files_checkpoints = pretrained_checkpoints + regular_checkpoints + last_checkpoint_files

    # Default selection: the newest checkpoint (last in the ordering).
    select_checkpoint = None if not files_checkpoints else files_checkpoints[-1]

    if is_gradio:
        return gr.update(choices=files_checkpoints, value=select_checkpoint)

    return files_checkpoints, select_checkpoint
176
+
177
+
178
def get_available_projects():
    """Return sorted project folder names under `pretrained_models/data`.

    Folders whose name contains "test" (held-out example data) are skipped;
    when no local data directory exists, the two known HF projects are used.
    """
    candidate_dirs = [
        str(Path(PRETRAINED_ROOT) / "data"),
    ]

    projects = []
    for candidate in candidate_dirs:
        if os.path.isdir(candidate):
            projects = [name for name in os.listdir(candidate) if "test" not in name]
            break

    # Fallback: if no local data dir, default to known HF projects
    if not projects:
        projects = ["multilingual_grl", "multilingual_prosody"]

    projects.sort()
    print("project_list:", projects)
    return projects
198
+
199
+
200
def infer(
    project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
):
    """Gradio callback: synthesize speech cloning the reference voice.

    Returns a 3-tuple for the UI outputs: (wav_file_path, device_info, seed),
    or (None, error_message, "") on any failure.
    """
    global last_checkpoint, last_device, tts_api, last_ema

    # Resolve checkpoint path (local or HF URL)
    ckpt_path = file_checkpoint
    if isinstance(ckpt_path, str) and ckpt_path.startswith("hf://"):
        try:
            ckpt_resolved = str(cached_path(ckpt_path))
        except Exception as e:
            traceback.print_exc()
            return None, f"Error downloading checkpoint: {str(e)}", ""
    else:
        ckpt_resolved = ckpt_path

    if not os.path.isfile(ckpt_resolved):
        return None, "Checkpoint not found!", ""

    # Prefer the denoised reference audio when the user produced one.
    if denoise_audio:
        ref_audio = denoise_audio

    device_test = device  # Use the global device

    # Rebuild the cached TTS engine only when checkpoint/device/EMA changed.
    if last_checkpoint != ckpt_resolved or last_device != device_test or last_ema != use_ema or tts_api is None:
        if last_checkpoint != ckpt_resolved:
            last_checkpoint = ckpt_resolved

        if last_device != device_test:
            last_device = device_test

        if last_ema != use_ema:
            last_ema = use_ema

        # Automatically enable prosody encoder when using the prosody checkpoint
        # (overrides the hidden checkbox value).
        use_prosody_encoder = True if "prosody" in str(ckpt_resolved) else False

        # Resolve vocab file (local)
        local_vocab = Path(PRETRAINED_ROOT) / "data" / project / "vocab.txt"
        if not local_vocab.is_file():
            return None, "Vocab file not found!", ""
        vocab_file = str(local_vocab)

        # Resolve prosody encoder config & weights (local)
        local_prosody_cfg = Path(CKPTS_ROOT) / "prosody_encoder" / "pretssel_cfg.json"
        local_prosody_ckpt = Path(CKPTS_ROOT) / "prosody_encoder" / "prosody_encoder_UnitY2.pt"
        if not local_prosody_cfg.is_file() or not local_prosody_ckpt.is_file():
            return None, "Prosody encoder files not found!", ""
        prosody_cfg_path = str(local_prosody_cfg)
        prosody_ckpt_path = str(local_prosody_ckpt)

        try:
            tts_api = TTS(
                model=exp_name,
                ckpt_file=ckpt_resolved,
                vocab_file=vocab_file,
                device=device_test,
                use_ema=use_ema,
                frontend=frontend,
                use_prosody_encoder=use_prosody_encoder,
                prosody_cfg_path=prosody_cfg_path,
                prosody_ckpt_path=prosody_ckpt_path,
            )
        except Exception as e:
            traceback.print_exc()
            return None, f"Error loading model: {str(e)}", ""

        print("Model loaded >>", device_test, file_checkpoint, use_ema)

    if seed == -1:  # -1 used for random
        seed = None

    # NOTE(review): on the cached path (model not reloaded) the checkbox value
    # of use_prosody_encoder is forwarded unmodified — confirm this matches
    # the checkpoint the cached engine was built with.
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            tts_api.infer(
                ref_file=ref_audio,
                ref_text=ref_text.strip(),
                gen_text=gen_text.strip(),
                nfe_step=nfe_step,
                separate_langs=separate_langs,
                speed=speed,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                use_acc_grl=use_acc_grl,
                ref_ratio=ref_ratio,
                no_ref_audio=no_ref_audio,
                use_prosody_encoder=use_prosody_encoder,
                file_wave=f.name,
                seed=seed,
            )
        return f.name, f"Device: {tts_api.device}", str(tts_api.seed)
    except Exception as e:
        traceback.print_exc()
        return None, f"Inference error: {str(e)}", ""
294
+
295
+
296
def get_gpu_stats():
    """Return a human-readable accelerator summary (CUDA, XPU, MPS, or none)."""
    gpu_stats = ""

    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        for i in range(gpu_count):
            gpu_name = torch.cuda.get_device_name(i)
            gpu_properties = torch.cuda.get_device_properties(i)
            total_memory = gpu_properties.total_memory / (1024**3)  # in GB
            allocated_memory = torch.cuda.memory_allocated(i) / (1024**2)  # in MB
            reserved_memory = torch.cuda.memory_reserved(i) / (1024**2)  # in MB

            gpu_stats += (
                f"GPU {i} Name: {gpu_name}\n"
                f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
                f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
                f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
            )
    elif torch.xpu.is_available():
        # Intel XPU mirrors the CUDA memory-introspection API.
        gpu_count = torch.xpu.device_count()
        for i in range(gpu_count):
            gpu_name = torch.xpu.get_device_name(i)
            gpu_properties = torch.xpu.get_device_properties(i)
            total_memory = gpu_properties.total_memory / (1024**3)  # in GB
            allocated_memory = torch.xpu.memory_allocated(i) / (1024**2)  # in MB
            reserved_memory = torch.xpu.memory_reserved(i) / (1024**2)  # in MB

            gpu_stats += (
                f"GPU {i} Name: {gpu_name}\n"
                f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
                f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
                f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
            )
    elif torch.backends.mps.is_available():
        gpu_count = 1
        gpu_stats += "MPS GPU\n"
        total_memory = psutil.virtual_memory().total / (
            1024**3
        )  # Total system memory (MPS doesn't have its own memory)
        allocated_memory = 0
        reserved_memory = 0

        gpu_stats += (
            f"Total system memory: {total_memory:.2f} GB\n"
            f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
            f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
        )

    else:
        gpu_stats = "No GPU available"

    return gpu_stats
349
+
350
+
351
def get_cpu_stats():
    """Return a summary of CPU load, system memory, and process priority."""
    usage_pct = psutil.cpu_percent(interval=1)
    vmem = psutil.virtual_memory()
    used_mb = vmem.used / (1024**2)
    total_mb = vmem.total / (1024**2)

    # Nice value of the current process (lower = higher scheduling priority).
    current_process = psutil.Process(os.getpid())
    nice_value = current_process.nice()

    return (
        f"CPU Usage: {usage_pct:.2f}%\n"
        f"System Memory: {used_mb:.2f} MB used / {total_mb:.2f} MB total ({vmem.percent}% used)\n"
        f"Process Priority (Nice value): {nice_value}"
    )
370
+
371
+
372
def get_combined_stats():
    """Concatenate GPU and CPU stats into one markdown snippet for the UI."""
    return f"### GPU Stats\n{get_gpu_stats()}\n\n### CPU Stats\n{get_cpu_stats()}"
378
+
379
+
380
+ # Create Gradio interface
381
+ with gr.Blocks(title="LEMAS-TTS Inference") as app:
382
+ gr.Markdown(
383
+ """
384
+ # Zero-Shot TTS
385
+
386
+ Set seed to -1 for random generation.
387
+ """
388
+ )
389
+ with gr.Accordion("Model configuration", open=False):
390
+ # Model configuration
391
+ with gr.Row():
392
+ exp_name = gr.Radio(
393
+ label="Model",
394
+ choices=["multilingual_grl", "multilingual_prosody"],
395
+ value="multilingual_grl",
396
+ visible=False,
397
+ )
398
+ # Project selection
399
+ available_projects = get_available_projects()
400
+
401
+ # Get initial checkpoints
402
+ list_checkpoints, checkpoint_select = get_checkpoints_project(available_projects[0] if available_projects else None, False)
403
+
404
+ with gr.Row():
405
+ with gr.Column(scale=1):
406
+ # load_models_btn = gr.Button(value="Load models")
407
+ cm_project = gr.Dropdown(
408
+ choices=available_projects,
409
+ value=available_projects[0] if available_projects else None,
410
+ label="Project",
411
+ allow_custom_value=True,
412
+ scale=4
413
+ )
414
+
415
+ with gr.Column(scale=5):
416
+ cm_checkpoint = gr.Dropdown(
417
+ choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True # scale=4,
418
+ )
419
+ bt_checkpoint_refresh = gr.Button("Refresh", scale=1)
420
+
421
+ with gr.Row():
422
+ ch_use_ema = gr.Checkbox(label="Use EMA", visible=False, value=True, scale=2, info="Turn off at early stage might offer better results")
423
+ frontend = gr.Radio(label="Frontend", visible=False, choices=["phone", "char", "bpe"], value="phone", scale=3)
424
+ separate_langs = gr.Checkbox(label="Separate Languages", visible=False, value=True, scale=2, info="separate language tokens")
425
+
426
+ # Inference parameters
427
+ with gr.Row():
428
+ nfe_step = gr.Number(label="NFE Step", scale=1, value=64)
429
+ speed = gr.Slider(label="Speed", scale=3, value=1.0, minimum=0.5, maximum=1.5, step=0.1)
430
+ cfg_strength = gr.Slider(label="CFG Strength", scale=2, value=5.0, minimum=0.0, maximum=10.0, step=1)
431
+ sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", scale=2, value=3, minimum=2, maximum=5, step=0.1)
432
+ ref_ratio = gr.Slider(label="Ref Ratio", scale=2, value=1.0, minimum=0.0, maximum=1.0, step=0.1)
433
+ no_ref_audio = gr.Checkbox(label="No Reference Audio", visible=False, value=False, scale=1, info="No mel condition")
434
+ use_acc_grl = gr.Checkbox(label="Use accent grl condition", visible=False, value=True, scale=1, info="Use accent grl condition")
435
+ use_prosody_encoder = gr.Checkbox(label="Use prosody encoder", visible=False, value=False, scale=1, info="Use prosody encoder")
436
+ seed = gr.Number(label="Random Seed", scale=1, value=-1, minimum=-1)
437
+
438
+
439
+ # Input fields
440
+ ref_text = gr.Textbox(label="Reference Text", placeholder="Enter the text for the reference audio...")
441
+ ref_audio = gr.Audio(label="Reference Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
442
+
443
+
444
+ with gr.Accordion("Denoise audio (Optional / Recommend)", open=True):
445
+ with gr.Row():
446
+ denoise_btn = gr.Button(value="Denoise")
447
+ cancel_btn = gr.Button(value="Cancel Denoise")
448
+ denoise_audio = gr.Audio(label="Denoised Audio", value=None, type="filepath", interactive=True, show_download_button=True, editable=True)
449
+
450
+ gen_text = gr.Textbox(label="Text to Generate", placeholder="Enter the text you want to generate...")
451
+
452
+ # Inference button and outputs
453
+ with gr.Row():
454
+ txt_info_gpu = gr.Textbox("", label="Device Info")
455
+ seed_info = gr.Textbox(label="Used Random Seed")
456
+ check_button_infer = gr.Button("Generate Audio", variant="primary")
457
+
458
+ gen_audio = gr.Audio(label="Generated Audio", type="filepath", interactive=True, show_download_button=True, editable=True)
459
+
460
+ # Examples
461
+ def _resolve_example(name: str) -> str:
462
+ local = Path(PRETRAINED_ROOT) / "data" / "test_examples" / name
463
+ return str(local) if local.is_file() else ""
464
+
465
+ examples = gr.Examples(
466
+ examples=[
467
+ ["em, #1 I have a list of YouTubers, and I'm gonna be going to their houses and raiding them by.",
468
+ _resolve_example("en.wav"),
469
+ "我有一份 YouTuber 名单,我打算去他们家,对他们进行突袭。",
470
+ ],
471
+ ["Te voy a dar un tip #1 que le copia a John Rockefeller, uno de los empresarios más picudos de la historia.",
472
+ _resolve_example("es.wav"),
473
+ "我要给你一个从历史上最精明的商人之一约翰·洛克菲勒那里抄来的秘诀。",
474
+ ],
475
+ ["Nova, #1 dia 25 desse mês vai rolar operação the last Frontier.",
476
+ _resolve_example("pt.wav"),
477
+ "新消息,本月二十五日,'最后的边疆行动'将启动。",
478
+ ],
479
+ ],
480
+ inputs=[
481
+ ref_text,
482
+ ref_audio,
483
+ gen_text,
484
+ ],
485
+ outputs=[gen_audio, txt_info_gpu, seed_info],
486
+ fn=infer,
487
+ cache_examples=False
488
+ )
489
+
490
+ # System Info section at the bottom
491
+ gr.Markdown("---")
492
+ gr.Markdown("## System Information")
493
+ with gr.Accordion("Update System Stats", open=False):
494
+ update_button = gr.Button("Update System Stats", scale=1)
495
+ output_box = gr.Textbox(label="GPU and CPU Information", lines=5, scale=5)
496
+
497
+ def update_stats():
498
+ return get_combined_stats()
499
+
500
+
501
+ denoise_btn.click(fn=denoise,
502
+ inputs=[ref_audio],
503
+ outputs=[denoise_audio])
504
+
505
+ cancel_btn.click(fn=cancel_denoise,
506
+ inputs=[ref_audio],
507
+ outputs=[denoise_audio])
508
+
509
+ # Event handlers
510
+ check_button_infer.click(
511
+ fn=infer,
512
+ inputs=[
513
+ cm_project,
514
+ cm_checkpoint,
515
+ exp_name,
516
+ ref_text,
517
+ ref_audio,
518
+ denoise_audio,
519
+ gen_text,
520
+ nfe_step,
521
+ ch_use_ema,
522
+ separate_langs,
523
+ frontend,
524
+ speed,
525
+ cfg_strength,
526
+ use_acc_grl,
527
+ ref_ratio,
528
+ no_ref_audio,
529
+ sway_sampling_coef,
530
+ use_prosody_encoder,
531
+ seed,
532
+ ],
533
+ outputs=[gen_audio, txt_info_gpu, seed_info],
534
+ )
535
+
536
+ bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
537
+ cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
538
+
539
+ ref_audio.change(
540
+ fn=lambda x: None,
541
+ inputs=[ref_audio],
542
+ outputs=[denoise_audio]
543
+ )
544
+
545
+ update_button.click(fn=update_stats, outputs=output_box)
546
+
547
+ # Auto-load system stats on startup
548
+ app.load(fn=update_stats, outputs=output_box)
549
+
550
+
551
@click.command()
@click.option("--port", "-p", default=7860, type=int, help="Port to run the app on")
@click.option("--host", "-H", default="0.0.0.0", help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
    """CLI entry point: launch the Gradio app built above."""
    global app
    print("Starting LEMAS-TTS Inference Interface...")
    print(f"Device: {device}")
    # allowed_paths lets Gradio serve the bundled example audio files.
    app.queue(api_open=api).launch(
        server_name=host,
        server_port=port,
        share=share,
        show_api=api,
        allowed_paths=[str(Path(PRETRAINED_ROOT) / "data")],
    )


if __name__ == "__main__":
    main()
lemas_tts/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
"""LEMAS-TTS package: exposes the high-level `TTS` inference class."""

from .api import TTS

__all__ = ["TTS"]

__version__ = "0.1.0"
6
+
lemas_tts/api.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import sys
4
+ from pathlib import Path
5
+ import re, regex
6
+ import soundfile as sf
7
+ import tqdm
8
+ from hydra.utils import get_class
9
+ from omegaconf import OmegaConf
10
+
11
+ from lemas_tts.infer.utils_infer import (
12
+ load_model,
13
+ load_vocoder,
14
+ transcribe,
15
+ preprocess_ref_audio_text,
16
+ infer_process,
17
+ remove_silence_for_generated_wav,
18
+ save_spectrogram,
19
+ )
20
+ from lemas_tts.model.utils import seed_everything
21
+ from lemas_tts.model.backbones.dit import DiT
22
+
23
+
24
# Resolve repository layout so we can find pretrained assets (ckpts, vocoder, etc.)
THIS_FILE = Path(__file__).resolve()
print("THIS_FILE:", THIS_FILE)  # startup diagnostic for Spaces logs
27
+
28
+ def _find_repo_root(start: Path) -> Path:
29
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
30
+ for p in [start, *start.parents]:
31
+ if (p / "pretrained_models").is_dir():
32
+ return p
33
+ cwd = Path.cwd()
34
+ if (cwd / "pretrained_models").is_dir():
35
+ return cwd
36
+ return start
37
+
38
+
39
+ def _find_pretrained_root(start: Path) -> Path:
40
+ """
41
+ Locate the `pretrained_models` root, with support for:
42
+ 1) Explicit env override (LEMAS_PRETRAINED_ROOT)
43
+ 2) Hugging Face Spaces model mount under /models
44
+ 3) Local source tree (searching upwards from this file)
45
+ """
46
+ # 1) Explicit override
47
+ env_root = os.environ.get("LEMAS_PRETRAINED_ROOT")
48
+ if env_root:
49
+ p = Path(env_root)
50
+ if p.is_dir():
51
+ return p
52
+
53
+ # 2) HF Spaces model mount: /models/<model_id>/pretrained_models
54
+ models_dir = Path("/models")
55
+ if models_dir.is_dir():
56
+ # Try the expected model name first
57
+ specific = models_dir / "LEMAS-Project__LEMAS-TTS"
58
+ if (specific / "pretrained_models").is_dir():
59
+ return specific / "pretrained_models"
60
+ # Otherwise, pick the first model that has a pretrained_models subdir
61
+ for child in models_dir.iterdir():
62
+ if child.is_dir() and (child / "pretrained_models").is_dir():
63
+ return child / "pretrained_models"
64
+
65
+ # 3) Local repo layout
66
+ repo_root = _find_repo_root(start)
67
+ if (repo_root / "pretrained_models").is_dir():
68
+ return repo_root / "pretrained_models"
69
+
70
+ cwd = Path.cwd()
71
+ if (cwd / "pretrained_models").is_dir():
72
+ return cwd / "pretrained_models"
73
+
74
+ # Fallback: assume under repo root even if directory is missing
75
+ return repo_root / "pretrained_models"
76
+
77
+
78
# Resolved filesystem anchors used throughout the package.
REPO_ROOT = _find_repo_root(THIS_FILE)  # repository checkout root
PRETRAINED_ROOT = _find_pretrained_root(THIS_FILE)  # `pretrained_models` dir
CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"  # model/vocoder checkpoints
81
+
82
class TTS:
    """High-level inference wrapper for the LEMAS-TTS flow-matching model.

    Loads the DiT backbone, a vocoder, optionally a prosody encoder and a
    text frontend, and exposes `infer()` for zero-shot voice cloning.
    """

    def __init__(
        self,
        model="multilingual",
        ckpt_file="",
        vocab_file="",
        use_prosody_encoder=False,
        prosody_cfg_path="",
        prosody_ckpt_path="",
        ode_method="euler",
        use_ema=False,
        vocoder_local_path=str(CKPTS_ROOT / "vocos-mel-24khz"),
        device=None,
        hf_cache_dir=None,
        frontend="phone",
    ):
        """Build the inference stack.

        Args:
            model: config name — `configs/<model>.yaml` must exist.
            ckpt_file: path to the DiT checkpoint (.pt / .safetensors).
            vocab_file: path to vocab.txt for the tokenizer.
            use_prosody_encoder: attach the PretSSEL prosody encoder.
            prosody_cfg_path / prosody_ckpt_path: prosody encoder assets.
            ode_method: ODE solver for the CFM sampler.
            use_ema: load EMA weights from the checkpoint.
            vocoder_local_path: local vocoder dir; falls back to HF download.
            device: explicit device string; auto-detected when None.
            hf_cache_dir: cache dir passed to the vocoder downloader.
            frontend: "phone" | "char" | "bpe" | None (raw text).
        """
        # Load model architecture config from bundled yaml
        config_dir = THIS_FILE.parent / "configs"
        model_cfg = OmegaConf.load(config_dir / f"{model}.yaml")
        # model_cls = get_class(f"lemas_tts.model.dit.{model_cfg.model.backbone}")
        model_arc = model_cfg.model.arch

        self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
        self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate

        self.ode_method = ode_method
        self.use_ema = use_ema
        # remember whether this TTS instance is configured with a prosody encoder
        self.use_prosody_encoder = use_prosody_encoder
        # Language-tag mapping — presumably LEMAS tags -> espeak-ng voice ids;
        # TODO confirm against the frontend implementation.
        self.langs = {"cmn":"zh", "zh":"zh", "en":"en-us", "it":"it", "es":"es", "pt":"pt-br", "fr":"fr-fr", "de":"de", "ru":"ru", "id":"id", "vi":"vi", "th":"th"}

        if device is not None:
            self.device = device
        else:
            import torch

            # Auto-detect: CUDA > XPU > MPS > CPU.
            self.device = (
                "cuda"
                if torch.cuda.is_available()
                else "xpu"
                if torch.xpu.is_available()
                else "mps"
                if torch.backends.mps.is_available()
                else "cpu"
            )

        # # Load models
        # Prefer local vocoder directory if it exists; otherwise let `load_vocoder`
        # fall back to downloading from the default HF repo (charactr/vocos-mel-24khz).
        vocoder_is_local = False
        if vocoder_local_path is not None:
            try:
                vocoder_is_local = Path(vocoder_local_path).is_dir()
            except TypeError:
                vocoder_is_local = False

        self.vocoder = load_vocoder(
            self.mel_spec_type, vocoder_is_local, vocoder_local_path, self.device, hf_cache_dir
        )
        # self.vocoder = load_vocoder(vocoder_name="vocos", is_local=True, local_path=vocoder_local_path, device=self.device)
        if frontend is not None:
            from lemas_tts.infer.frontend import TextNorm
            # try:
            # Try requested frontend first (typically "phone")
            self.frontend = TextNorm(dtype=frontend)
            # except Exception as e:
            #     # If espeak/phonemizer is not available, gracefully fall back to char frontend
            #     print(f"[TTS] Failed to init TextNorm with dtype='{frontend}': {e}")
            #     print("[TTS] Falling back to char frontend (no espeak required).")
            #     self.frontend = TextNorm(dtype="char")
        else:
            # NOTE(review): infer() dereferences self.frontend.dtype, so
            # frontend=None would crash there — confirm intended usage.
            self.frontend = None

        self.ema_model = load_model(
            DiT,
            model_arc,
            ckpt_file,
            self.mel_spec_type,
            vocab_file,
            self.ode_method,
            self.use_ema,
            self.device,
            use_prosody_encoder=use_prosody_encoder,
            prosody_cfg_path=prosody_cfg_path,
            prosody_ckpt_path=prosody_ckpt_path,
        )

    def transcribe(self, ref_audio, language=None):
        """ASR-transcribe `ref_audio` via `utils_infer.transcribe`."""
        return transcribe(ref_audio, language)

    def export_wav(self, wav, file_wave, remove_silence=False):
        """Write `wav` to `file_wave` at the model's target sample rate."""
        sf.write(file_wave, wav, self.target_sample_rate)

        if remove_silence:
            remove_silence_for_generated_wav(file_wave)

    def export_spectrogram(self, spec, file_spec):
        """Save a spectrogram image of `spec` to `file_spec`."""
        save_spectrogram(spec, file_spec)

    def infer(
        self,
        ref_file,
        ref_text,
        gen_text,
        show_info=print,
        progress=tqdm,
        target_rms=0.1,
        cross_fade_duration=0.15,
        use_acc_grl=False,
        ref_ratio=None,
        no_ref_audio=False,
        cfg_strength=2,
        nfe_step=32,
        speed=1.0,
        sway_sampling_coef=5,
        separate_langs=False,
        fix_duration=None,
        use_prosody_encoder=True,
        file_wave=None,
        file_spec=None,
        seed=None,
    ):
        """Synthesize `gen_text` in the voice of `ref_file`/`ref_text`.

        Returns (wav, sample_rate, spectrogram); optionally writes the wav
        to `file_wave` and the spectrogram image to `file_spec`.
        The RNG seed actually used is recorded on `self.seed`.
        """
        # Seed all RNGs; None draws a fresh random seed (reproducible later).
        if seed is None:
            seed = random.randint(0, sys.maxsize)
        seed_everything(seed)
        self.seed = seed

        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)
        print("preprocesss:\n", "ref_file:", ref_file, "\nref_text:", ref_text)
        # Text normalization: phone frontend yields "|"-separated phone
        # tokens; char frontend yields a (lang, normalized-chars) pair.
        # In both cases "cmn" is folded into the "zh" language tag.
        if self.frontend.dtype == "phone":
            ref_text = self.frontend.text2phn(ref_text+". ").replace("(cmn)", "(zh)").split("|")
            gen_text = gen_text.split("\n")
            gen_text = [self.frontend.text2phn(x+". ").replace("(cmn)", "(zh)").split("|") for x in gen_text]

        elif self.frontend.dtype == "char":
            src_lang, ref_text = self.frontend.text2norm(ref_text+". ")
            ref_text = ["("+src_lang.replace("cmn", "zh")+")"] + list(ref_text)
            gen_text = gen_text.split("\n")
            gen_text = [self.frontend.text2norm(x+". ") for x in gen_text]
            gen_text = [["("+x[0].replace("cmn", "zh")+")"] + list(x[1]) for x in gen_text]
        print("after frontend:\n", "ref_text:", ref_text, "\ngen_text:", gen_text)

        if separate_langs:
            ref_text = self.process_phone_list(ref_text)  # Optional
            gen_text = [self.process_phone_list(x) for x in gen_text]

        print("gen_text:", gen_text, "\nref_text:", ref_text)

        wav, sr, spec = infer_process(
            ref_file,
            ref_text,
            gen_text,
            self.ema_model,
            self.vocoder,
            self.mel_spec_type,
            show_info=show_info,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            use_prosody_encoder=use_prosody_encoder,
            use_acc_grl=use_acc_grl,
            ref_ratio=ref_ratio,
            no_ref_audio=no_ref_audio,
            speed=speed,
            fix_duration=fix_duration,
            device=self.device,
        )

        if file_wave is not None:
            self.export_wav(wav, file_wave, remove_silence=False)

        if file_spec is not None:
            self.export_spectrogram(spec, file_spec)

        return wav, sr, spec


    def process_phone_list(self, parts):
        """(vocab756 version) Prefix phones that lack a language id with the
        currently active language tag, and normalize pause/punctuation runs.
        """
        puncs = {"#1", "#2", "#3", "#4", "_", "!", ",", ".", "?", '"', "'", "^", "。", ",", "?", "!"}
        # parts = phn_str.split('|')
        processed = []
        current_lang = ""
        for i in range(len(parts)):
            part = parts[i]
            if part.startswith('(') and part.endswith(')') and part[1:-1] in self.langs:
                # Token is a language id like "(en)": remember it but do not
                # emit it standalone.
                current_lang = part
                # processed.append(part)
            elif part in puncs:  # pause marker or punctuation
                # Collapse pause/punct sequences: drop a trailing "_" before
                # punctuation, and skip a "_" that follows punctuation.
                if len(processed) > 0 and processed[-1] == "_":
                    processed.pop()
                elif len(processed) > 0 and processed[-1] in puncs and part == "_":
                    continue
                processed.append(part)
                # if i < len(parts) - 1 and parts[i+1] != "_":
                #     processed.append("_")
            elif current_lang is not None:
                # Regular phone: prefix with the active language id.
                # NOTE(review): current_lang starts as "" and is never set to
                # None, so this branch always fires — likely intended.
                processed.append(f"{current_lang}{part}")
        return processed
288
+
289
+
290
+ if __name__ == "__main__":
291
+ f5tts = F5TTS()
292
+
293
+ wav, sr, spec = f5tts.infer(
294
+ ref_file=str((THIS_FILE.parent / "infer" / "examples" / "basic" / "basic_ref_en.wav").resolve()),
295
+ ref_text="some call me nature, others call me mother nature.",
296
+ gen_text=(
297
+ "I don't really care what you call me. I've been a silent spectator, watching species evolve, "
298
+ "empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture "
299
+ "you; ignore me and you shall face the consequences."
300
+ ),
301
+ file_wave=str((REPO_ROOT / "outputs" / "api_out.wav").resolve()),
302
+ file_spec=str((REPO_ROOT / "outputs" / "api_out.png").resolve()),
303
+ seed=None,
304
+ )
305
+
306
+ print("seed :", f5tts.seed)
lemas_tts/configs/multilingual_grl.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # compute_environment: LOCAL_MACHINE
2
+ # debug: false
3
+ # distributed_type: MULTI_GPU
4
+ # downcast_bf16: 'no'
5
+ # enable_cpu_affinity: true
6
+ # gpu_ids: all
7
+ # # machine_rank: 0
8
+ # # main_training_function: main
9
+ # mixed_precision: bf16
10
+ # num_machines: 1
11
+ # num_processes: 16
12
+ # # rdzv_backend: static
13
+ # same_network: true
14
+ # use_cpu: false
15
+
16
+
17
+ hydra:
18
+ run:
19
+ dir: exp/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
20
+
21
+ datasets:
22
+ name: multilingual_vocab898_acc_grl_ctc_fix # dataset name
23
+ batch_size_per_gpu: 40000 # 8 GPUs, 8 * 38400 = 307200
24
+ batch_size_type: frame # frame | sample
25
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
26
+ num_workers: 2
27
+ separate_langs: True
28
+
29
+ optim:
30
+ epochs: 100
31
+ learning_rate: 2e-5
32
+ num_warmup_updates: 1000 # warmup updates
33
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
34
+ max_grad_norm: 1.0 # gradient clipping
35
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
36
+ model:
37
+ name: multilingual # model name
38
+ tokenizer: custom # tokenizer type
39
+ tokenizer_path: "pretrained_models/data/multilingual_grl/vocab.txt" # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
40
+ audio_dir: "pretrained_models/data/multilingual_grl"
41
+ use_ctc_loss: True # whether to use ctc loss
42
+ use_spk_enc: False
43
+ use_prosody_encoder: False
44
+ prosody_cfg_path: "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json" # pretssel_cfg.json
45
+ prosody_ckpt_path: "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt" # prosody_encoder_pretssel.pt
46
+
47
+ backbone: DiT
48
+ arch:
49
+ dim: 1024
50
+ depth: 22
51
+ heads: 16
52
+ ff_mult: 2
53
+ text_dim: 512
54
+ text_mask_padding: True
55
+ qk_norm: null # null | rms_norm
56
+ conv_layers: 4
57
+ pe_attn_head: null
58
+ checkpoint_activations: False # recompute activations and save memory for extra compute
59
+ mel_spec:
60
+ target_sample_rate: 24000
61
+ n_mel_channels: 100
62
+ hop_length: 256
63
+ win_length: 1024
64
+ n_fft: 1024
65
+ mel_spec_type: vocos # vocos | bigvgan
66
+ vocoder:
67
+ is_local: True # use local offline ckpt or not
68
+ # Path in the original training environment; kept here for reference only.
69
+ # For the open-sourced LEMAS-TTS repo, use `pretrained_models/ckpts/vocos-mel-24khz`.
70
+ local_path: "pretrained_models/ckpts/vocos-mel-24khz" # local vocoder path
71
+
72
+ ckpts:
73
+ logger: tensorboard # wandb | tensorboard | null
74
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
75
+ save_per_updates: 1000 # save checkpoint per updates
76
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
77
+ last_per_updates: 1000 # save last checkpoint per updates
78
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
lemas_tts/configs/multilingual_prosody.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # compute_environment: LOCAL_MACHINE
2
+ # debug: false
3
+ # distributed_type: MULTI_GPU
4
+ # downcast_bf16: 'no'
5
+ # enable_cpu_affinity: true
6
+ # gpu_ids: all
7
+ # # machine_rank: 0
8
+ # # main_training_function: main
9
+ # mixed_precision: bf16
10
+ # num_machines: 1
11
+ # num_processes: 16
12
+ # # rdzv_backend: static
13
+ # same_network: true
14
+ # use_cpu: false
15
+
16
+
17
+ hydra:
18
+ run:
19
+ dir: exp/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
20
+
21
+ datasets:
22
+ name: multilingual_vocab898_acc_grl_prosody_ctc_fix # dataset name
23
+ batch_size_per_gpu: 40000 # frames per GPU; with 8 GPUs the effective batch is 8 * 40000 = 320000 frames
24
+ batch_size_type: frame # frame | sample
25
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
26
+ num_workers: 2
27
+ separate_langs: True
28
+
29
+ optim:
30
+ epochs: 100
31
+ learning_rate: 2e-5
32
+ num_warmup_updates: 1000 # warmup updates
33
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
34
+ max_grad_norm: 1.0 # gradient clipping
35
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
36
+ model:
37
+ name: multilingual # model name
38
+ tokenizer: custom # tokenizer type
39
+ tokenizer_path: "pretrained_models/data/multilingual_grl/vocab.txt" # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
40
+ audio_dir: "pretrained_models/data/multilingual_grl"
41
+ use_ctc_loss: True # whether to use ctc loss
42
+ use_spk_enc: False
43
+ use_prosody_encoder: True
44
+ prosody_cfg_path: "pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json" # pretssel_cfg.json
45
+ prosody_ckpt_path: "pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt" # prosody_encoder_pretssel.pt
46
+
47
+ backbone: DiT
48
+ arch:
49
+ dim: 1024
50
+ depth: 22
51
+ heads: 16
52
+ ff_mult: 2
53
+ text_dim: 512
54
+ text_mask_padding: True
55
+ qk_norm: null # null | rms_norm
56
+ conv_layers: 4
57
+ pe_attn_head: null
58
+ checkpoint_activations: False # recompute activations and save memory for extra compute
59
+ mel_spec:
60
+ target_sample_rate: 24000
61
+ n_mel_channels: 100
62
+ hop_length: 256
63
+ win_length: 1024
64
+ n_fft: 1024
65
+ mel_spec_type: vocos # vocos | bigvgan
66
+ vocoder:
67
+ is_local: True # use local offline ckpt or not
68
+ # Path in the original training environment; kept here for reference only.
69
+ # For the open-sourced LEMAS-TTS repo, use `pretrained_models/ckpts/vocos-mel-24khz`.
70
+ local_path: "pretrained_models/ckpts/vocos-mel-24khz" # local vocoder path
71
+
72
+ ckpts:
73
+ logger: tensorboard # wandb | tensorboard | null
74
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
75
+ save_per_updates: 1000 # save checkpoint per updates
76
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
77
+ last_per_updates: 1000 # save last checkpoint per updates
78
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
lemas_tts/infer/edit_multilingual.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multilingual speech editing helpers for LEMAS-TTS.
3
+
4
+ This is adapted from F5-TTS's `speech_edit_multilingual.py`, but uses the
5
+ `lemas_tts.api.TTS` API instead of `F5TTS`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import List, Tuple
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import torchaudio
15
+
16
+ from lemas_tts.api import TTS
17
+
18
+
19
def build_tokens_from_text(tts: TTS, text: str) -> List[List[str]]:
    """
    Convert raw text into token sequence(s) consistent with the multilingual
    LEMAS-TTS training pipeline.

    Mirrors the frontend handling of `TTS.infer`:
    - frontend.dtype == "phone" -> TextNorm.text2phn -> split on '|'
    - frontend.dtype == "char"  -> TextNorm.text2norm -> language tag + chars
    - no frontend / unknown dtype -> plain character sequence fallback.

    Returns a single-element list wrapping the token list.
    """
    sentence = text.strip()
    # Guarantee terminal punctuation so the frontend sees a full sentence.
    if not sentence.endswith((".", "。", "!", "?", "?", "!")):
        sentence = sentence + "."

    frontend = getattr(tts, "frontend", None)
    if frontend is None:
        # No frontend attached: fall back to raw characters.
        return [list(sentence)]

    dtype = getattr(frontend, "dtype", "phone")

    if dtype == "phone":
        # Phone path: '|'-separated phones; normalize the Mandarin tag.
        phones = frontend.text2phn(sentence + " ").replace("(cmn)", "(zh)")
        return [[tok for tok in phones.split("|") if tok]]

    if dtype == "char":
        # Char path: language tag token followed by the normalized characters.
        lang, norm = frontend.text2norm(sentence + " ")
        tag = f"({lang.replace('cmn', 'zh')})"
        return [[tag] + list(norm)]

    # Unknown dtype: character-level fallback.
    return [list(sentence)]
54
+
55
+
56
def gen_wav_multilingual(
    tts: TTS,
    segment_audio: torch.Tensor,
    sr: int,
    target_text: str,
    parts_to_edit: List[Tuple[float, float]],
    nfe_step: int = 64,
    cfg_strength: float = 5.0,
    sway_sampling_coef: float = 3.0,
    ref_ratio: float = 1.0,
    no_ref_audio: bool = False,
    use_acc_grl: bool = False,
    use_prosody_encoder_flag: bool = False,
    seed: int | None = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Core editing routine:
    - build an edit mask over the mel frames;
    - run CFM.sample with that mask and the new text;
    - decode mel to waveform via the vocoder.

    Args:
        tts: loaded LEMAS-TTS ``TTS`` instance (supplies device, CFM model, vocoder).
        segment_audio: source waveform, 1-D ``[T]`` or 2-D ``[1, T]``, at rate ``sr``.
        sr: sample rate of ``segment_audio`` in Hz.
        target_text: transcript of the *edited* utterance (whole segment).
        parts_to_edit: ``(start_sec, end_sec)`` spans to regenerate; everything else is kept.
        nfe_step: number of solver steps for CFM sampling.
        cfg_strength: classifier-free guidance strength.
        sway_sampling_coef: sway sampling coefficient forwarded to ``model.sample``.
        ref_ratio, no_ref_audio, use_acc_grl, use_prosody_encoder_flag:
            forwarded to ``model.sample``; semantics defined by the CFM model.
        seed: optional RNG seed for reproducible sampling.

    Returns:
        ``(waveform, mel)`` — waveform with batch dim squeezed, mel as ``[B, C, T_mel]``.

    Raises:
        RuntimeError: if the CFM model has no attached MelSpec.
        ValueError: if ``tts.mel_spec_type`` is neither "vocos" nor "bigvgan".
    """
    device = tts.device
    model = tts.ema_model
    vocoder = tts.vocoder

    mel_spec = getattr(model, "mel_spec", None)
    if mel_spec is None:
        raise RuntimeError("CFM model has no attached MelSpec; check your checkpoint.")

    target_sr = int(mel_spec.target_sample_rate)
    hop_length = int(mel_spec.hop_length)
    target_rms = 0.1

    # Ensure a [1, T] batch dimension.
    if segment_audio.dim() == 1:
        audio = segment_audio.unsqueeze(0)
    else:
        audio = segment_audio

    # RMS normalization: only boost quiet audio up to target_rms (never
    # attenuate); the boost is undone on the output at the end.
    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        audio = audio * target_rms / rms

    # Resample if needed
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(sr, target_sr)
        audio = resampler(audio)

    audio = audio.to(device)

    # Build edit mask over mel frames (True = keep original frame).
    # `offset` tracks, in samples (float), how far the mask has been built.
    offset = 0.0
    edit_mask = torch.zeros(1, 0, dtype=torch.bool, device=device)
    for (start, end) in parts_to_edit:
        # small safety margin around the region to edit
        start = max(start - 0.1, 0.0)
        end = min(end + 0.1, audio.shape[-1] / target_sr)
        part_dur_sec = end - start
        part_dur_samples = int(round(part_dur_sec * target_sr))
        start_samples = int(round(start * target_sr))

        # frames before edited span: keep original (mask=True)
        num_keep_frames = int(round((start_samples - offset) / hop_length))
        # frames inside edited span: to be regenerated (mask=False)
        num_edit_frames = int(round(part_dur_samples / hop_length))

        if num_keep_frames > 0:
            edit_mask = torch.cat(
                [edit_mask, torch.ones(1, num_keep_frames, dtype=torch.bool, device=device)],
                dim=-1,
            )
        if num_edit_frames > 0:
            edit_mask = torch.cat(
                [edit_mask, torch.zeros(1, num_edit_frames, dtype=torch.bool, device=device)],
                dim=-1,
            )

        offset = end * target_sr

    # Pad mask to full sequence length (True = keep original)
    total_frames = audio.shape[-1] // hop_length
    if edit_mask.shape[-1] < total_frames + 1:
        pad_len = total_frames + 1 - edit_mask.shape[-1]
        edit_mask = F.pad(edit_mask, (0, pad_len), value=True)

    duration = total_frames

    # Text tokens using multilingual frontend
    final_text_list = build_tokens_from_text(tts, target_text)

    # For multilingual models trained with `separate_langs=True`, we need to
    # post-process the phone sequence so that each non-punctuation token is
    # prefixed with its language id, consistent with training and the main API.
    if hasattr(tts, "process_phone_list") and len(final_text_list) > 0:
        final_text_list = [tts.process_phone_list(final_text_list[0])]
    print("final_text_list:", final_text_list)

    with torch.inference_mode():
        generated, _ = model.sample(
            cond=audio,
            text=final_text_list,
            duration=duration,
            steps=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            seed=seed,
            edit_mask=edit_mask,
            use_acc_grl=use_acc_grl,
            use_prosody_encoder=use_prosody_encoder_flag,
            ref_ratio=ref_ratio,
            no_ref_audio=no_ref_audio,
        )

    generated = generated.to(torch.float32)
    generated_mel = generated.permute(0, 2, 1)  # [B, C, T_mel]

    mel_for_vocoder = generated_mel.to(device)
    if tts.mel_spec_type == "vocos":
        wav_out = vocoder.decode(mel_for_vocoder)
    elif tts.mel_spec_type == "bigvgan":
        wav_out = vocoder(mel_for_vocoder)
    else:
        raise ValueError(f"Unsupported vocoder type: {tts.mel_spec_type}")

    # Undo the input RMS boost so output loudness matches the source.
    if rms < target_rms:
        wav_out = wav_out * rms / target_rms

    return wav_out.squeeze(0), generated_mel
184
+
lemas_tts/infer/frontend.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re, regex
2
+ import langid
3
+ import uroman as ur
4
+ import jieba, zhconv
5
+ from num2words import num2words
6
+
7
+ jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__) + "/../infer/text_norm/jieba_dict.txt"))
8
+ # from pypinyin.core import Pinyin
9
+ from pypinyin import pinyin, lazy_pinyin, Style
10
+
11
+ from .text_norm.txt2pinyin import _PAUSE_SYMBOL, get_phoneme_from_char_and_pinyin
12
+ from .text_norm.cn_tn import NSWNormalizer
13
+ from .text_norm.tokenizer import TextTokenizer, txt2phone
14
+ from pypinyin.contrib.tone_convert import to_initials, to_finals_tone3
15
+ from pypinyin_dict.phrase_pinyin_data import large_pinyin # large_pinyin # cc_cedict
16
+ large_pinyin.load()
17
+
18
class TextNorm():
    """Multilingual text normalization / phonemization frontend.

    Combines a Mandarin pinyin lexicon, a Chinese non-standard-word (NSW)
    normalizer, and espeak-backed IPA tokenizers for the supported
    non-Chinese languages.
    """

    def __init__(self, dtype="phone"):
        # my_pinyin = Pinyin(MyConverter())
        # self.pinyin_parser = my_pinyin.pinyin
        # Mandarin pinyin syllable -> phoneme list, from the bundled lexicon file.
        cmn_lexicon = open(os.path.join(os.path.dirname(__file__)+'/../infer/text_norm/pinyin-lexicon-r.txt'),'r', encoding="utf-8").readlines()
        cmn_lexicon = [x.strip().split() for x in cmn_lexicon]
        self.cmn_dict = {x[0]:x[1:] for x in cmn_lexicon}
        # Restrict langid's search space to the languages the pipeline supports.
        langid.set_languages(['es','pt','zh','en','de','fr','it','ru', 'vi','id','th','ja','ko','ar'])
        # language id -> espeak voice name; Chinese goes through the pinyin path
        # instead (the "cmn"/"zh" entries are deliberately commented out).
        langs = {"en":"en-us", "it":"it", "es":"es", "pt":"pt-br", "fr":"fr-fr", "de":"de", "ru":"ru", "vi":"vi", "id":"id", "th":"th", "ja":"ja", "ko":"ko"} # "zh":"cmn", "cmn":"cmn", "ar":"ar-sa"}
        text_tokenizer = {}
        for k,v in langs.items():
            tokenizer = TextTokenizer(language=v, backend="espeak")
            # NOTE(review): "cmn" is commented out of `langs` above, so this
            # remap branch is currently dead; kept as-is.
            lang = "zh" if k == "cmn" else k
            text_tokenizer[k] = (lang, tokenizer)
        self.text_tokenizer = text_tokenizer
        self.cn_tn = NSWNormalizer()
        # dtype: "phone" (IPA/pinyin phonemes) or "char" (normalized characters).
        self.dtype = dtype
35
+
36
+ def detect_lang(self, text):
37
+ lang, _ = langid.classify(text)[0]
38
+ return lang
39
+
40
+ def sil_type(self, time_s):
41
+ if round(time_s) < 0.4:
42
+ return ""
43
+ elif round(time_s) >= 0.4 and round(time_s) < 0.8:
44
+ return "#1"
45
+ elif round(time_s) >= 0.8 and round(time_s) < 1.5:
46
+ return "#2"
47
+ elif round(time_s) >= 1.5 and round(time_s) < 3.0:
48
+ return "#3"
49
+ elif round(time_s) >= 3.0:
50
+ return "#4"
51
+
52
+
53
+ def add_sil_raw(self, sub_list, start_time, end_time, target_transcript):
54
+ txt = []
55
+ txt_list = [x["word"] for x in sub_list]
56
+ sil = self.sil_type(sub_list[0]["start"])
57
+ if len(sil) > 0:
58
+ txt.append(sil)
59
+ txt.append(txt_list[0])
60
+ for i in range(1, len(sub_list)):
61
+ if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
62
+ txt.append(target_transcript)
63
+ target_transcript = ""
64
+ else:
65
+ sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
66
+ if len(sil) > 0:
67
+ txt.append(sil)
68
+ txt.append(txt_list[i])
69
+ return ' '.join(txt)
70
+
71
    def add_sil(self, sub_list, start_time, end_time, target_transcript, src_lang, tar_lang):
        """Like `add_sil_raw`, but returns language-tagged chunks.

        Builds a list of [lang, text] pairs: source-language words/pauses
        outside [start_time, end_time], and *target_transcript* (in
        *tar_lang*) for words inside it. Consecutive same-language chunks
        are merged; empty entries are dropped.
        """
        txts = []
        txt_list = [x["word"] for x in sub_list]
        sil = self.sil_type(sub_list[0]["start"])
        if len(sil) > 0:
            txts.append([src_lang, sil])

        # Keep the first word only if it starts before the edited region.
        if sub_list[0]["start"] < start_time:
            txts.append([src_lang, txt_list[0]])
        for i in range(1, len(sub_list)):
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                # Inside the edited span: substitute the target text once;
                # later matches append "" and are dropped in the merge below.
                txts.append([tar_lang, target_transcript])
                target_transcript = ""
            else:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])

        # Merge consecutive same-language chunks, skipping empty texts.
        # NOTE(review): raises IndexError if `txts` is empty (e.g. a
        # single-word sub_list fully inside the edit span with an empty
        # target) — confirm callers guarantee non-empty input.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]

        return target_txt
99
+
100
+ def replace_numbers_with_words(self, sentence, lang="en"):
101
+ sentence = re.sub(r'(\d+)', r' \1 ', sentence) # add spaces around numbers
102
+
103
+ def replace_with_words(match):
104
+ num = match.group(0)
105
+ try:
106
+ return num2words(num, lang=lang) # Convert numbers to words
107
+ except:
108
+ return num # In case num2words fails (unlikely with digits but just to be safe)
109
+ return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
110
+
111
+
112
    def get_prompt(self, sub_list, start_time, end_time, src_lang):
        """Collect the words inside [start_time, end_time] as a prompt.

        Returns merged [lang, text] chunks (same merge scheme as `add_sil`),
        including pause markers derived from the word timing gaps.
        """
        txts = []
        txt_list = [x["word"] for x in sub_list]

        # Include the very first word (and any leading silence) only when the
        # prompt window starts at or before it.
        if start_time <= sub_list[0]["start"]:
            sil = self.sil_type(sub_list[0]["start"])
            if len(sil) > 0:
                txts.append([src_lang, sil])
            txts.append([src_lang, txt_list[0]])

        for i in range(1, len(sub_list)):
            # if sub_list[i]["start"] <= start_time and sub_list[i]["end"] <= end_time:
            #     txts.append([tar_lang, target_transcript])
            #     target_transcript = ""
            if sub_list[i]["start"] >= start_time and sub_list[i]["end"] <= end_time:
                sil = self.sil_type(sub_list[i]["start"] - sub_list[i-1]["end"])
                if len(sil) > 0:
                    txts.append([src_lang, sil])
                txts.append([src_lang, txt_list[i]])

        # Merge consecutive same-language chunks, dropping empty texts.
        # NOTE(review): raises IndexError if no word falls inside the window
        # (txts empty) — confirm callers guarantee an overlap.
        target_txt = [txts[0]]
        for txt in txts[1:]:
            if txt[1] == "":
                continue
            if txt[0] != target_txt[-1][0]:
                target_txt.append([txt[0], ""])
            target_txt[-1][-1] += " " + txt[1]
        return target_txt
140
+
141
+
142
    def txt2pinyin(self, text):
        """Normalize Chinese text and convert it to pinyin-based phonemes.

        The input may contain explicit pause markers "#1".."#4", Chinese
        characters, and Latin words. Returns (txts, phonemes): the display
        tokens and the phoneme tokens, built in parallel.
        """
        txts, phonemes = [], []
        # Split out explicit pause markers so they pass through untouched.
        texts = re.split(r"(#\d)", text)
        print("before norm: ", texts)
        for text in texts:
            if text in {'#1', '#2', '#3', '#4'}:
                txts.append(text)
                phonemes.append(text)
                continue
            # NSW normalization (numbers, dates, units, ... -> Chinese words).
            text = self.cn_tn.normalize(text.strip())

            text_list = list(jieba.cut(text))
            print("jieba cut: ", text, text_list)
            for words in text_list:
                if words in _PAUSE_SYMBOL:
                    # phonemes[-1] += _PAUSE_SYMBOL[words]
                    # Punctuation mapped to a pause symbol.
                    # NOTE(review): `txts[-1] += words` assumes a previous
                    # token exists; a leading punctuation mark would raise
                    # IndexError — confirm inputs.
                    phonemes.append(_PAUSE_SYMBOL[words])
                    # phonemes.append('#1')
                    txts[-1] += words
                elif re.search("[\u4e00-\u9fa5]+", words):
                    # Chinese word: pinyin with tone sandhi, neutral tone as "5".
                    # pinyin = self.pinyin_parser(words, style=Style.TONE3, errors="ignore")
                    pinyin = lazy_pinyin(words, style=Style.TONE3, tone_sandhi=True, neutral_tone_with_five=True)
                    new_pinyin = []
                    for x in pinyin:
                        x = "".join(x)
                        if "#" not in x:
                            new_pinyin.append(x)
                        else:
                            # Pinyin containing "#" is treated as unparseable;
                            # fall back to the raw word.
                            phonemes.append(words)
                            continue
                    # new_pinyin = change_tone_in_bu_or_yi(words, new_pinyin) if len(words)>1 and words[-1] not in {"一","不"} else new_pinyin
                    phoneme = get_phoneme_from_char_and_pinyin(words, new_pinyin)
                    phonemes += phoneme
                    txts += list(words)
                elif re.search(r"[a-zA-Z]", words) or re.search(r"#[1-4]", words):
                    # Latin word (or embedded pause marker): uppercase verbatim.
                    phonemes.append(words.upper())
                    txts.append(words.upper())
        # phonemes.append("#1")
        # phones = " ".join(phonemes)
        return txts, phonemes
182
+
183
+
184
    def txt2pin_phns(self, text):
        """Convert a space-separated mixed pinyin/foreign-word string to a
        '|'-separated phone string.

        Pinyin syllables found in the Mandarin lexicon become
        "(zh)|initial|final"; other words are phonemized to IPA via the
        per-language espeak tokenizer and prefixed with a "(lang)" tag.
        Word boundaries are marked with "_" tokens.
        """
        # Detach punctuation that is glued to the previous token, then
        # collapse whitespace.
        text = re.sub(r'(?<! )(' + r'[^\w\s]' + r')', r' \1', text)
        text = re.sub(r'\s+', ' ', text).strip()

        # print(text.split(" "))
        res_list = []
        for txt in text.split(" "):
            if txt in self.cmn_dict:
                # Mandarin syllable: language tag + initial + final(tone3).
                # res_list += ["(zh)" + x for x in self.cmn_dict[txt]]
                res_list.append("(zh)")
                res_list.append(to_initials(txt, strict=False))
                res_list.append(to_finals_tone3(txt, neutral_tone_with_five=True))
            elif txt == '':
                continue
            elif txt[0] in {"#1", "#2", "#3", "#4"} or not bool(regex.search(r'\p{L}', txt[0][0])):
                # Pause marker or punctuation: drop a trailing word boundary
                # and emit the token verbatim.
                # NOTE(review): `txt[0] in {"#1", ...}` compares a single
                # character against two-character markers and can never be
                # true; the punctuation test effectively decides this branch.
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                res_list += [txt]
                continue
            else:
                # Foreign word: detect its language and phonemize to IPA.
                if len(res_list) > 0 and res_list[-1] == "_":
                    res_list.pop()
                lang = langid.classify(txt)[0]
                lang = lang if lang in self.text_tokenizer else "en"
                tokenizer = self.text_tokenizer[lang][1]
                ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
                phns = ipa[0] if ipa[0][0] == "(" else f"({lang})_" + ipa[0]
                res_list += phns.replace("_", "|_|").split("|")

                # lang = phns.split(")")[0][1:]
                # phns = phns[len(lang)+3:].replace("_", "|_|")
                # phns = phns.split("|")
                # for i in range(len(phns)):
                #     if phns[i] not in {"#1", "#2", "#3", "#4", "_", ",", ".", "?", "!"}:
                #         phns[i] = f"({lang})" + phns[i]
                # res_list += phns
            # Word boundary marker after each emitted word.
            res_list.append("_")
        res = "|".join(res_list)
        # Collapse runs of consecutive boundary markers.
        res = re.sub(r'(\|_)+', '|_', res)
        return res
224
+
225
+
226
    def text2phn(self, sentence, lang=None):
        """Convert *sentence* into a '|'-separated phone string.

        Text containing Chinese characters goes through jieba + pinyin and
        then the mixed pinyin/IPA path (`txt2pin_phns`); any other language
        is phonemized to IPA directly and prefixed with a "(lang)" tag.
        """
        if not lang:
            lang = langid.classify(sentence)[0]
        if re.search("[\u4e00-\u9fa5]+", sentence):
            txts, phones = self.txt2pinyin(sentence)
            transcript_norm = " ".join(phones)
            phones = self.txt2pin_phns(transcript_norm)  # IPA mix Pinyin
        else:
            # NOTE(review): `transcript` is computed but unused here.
            transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
            transcript_norm = sentence
            # All IPA
            # Sentence-final periods are replaced with commas before phonemization.
            phones = txt2phone(self.text_tokenizer[lang][1], transcript_norm.strip().replace(".", ",").replace("。", ","))
            # NOTE(review): `phones[0]` raises IndexError on an empty result.
            phones = f"({lang})|" + phones if phones[0] != "(" else phones
        return phones
240
+
241
+
242
+ def text2norm(self, sentence, lang=None):
243
+ if not lang:
244
+ lang = langid.classify(sentence)[0]
245
+ if re.search("[\u4e00-\u9fa5]+", sentence):
246
+ txts, phones = self.txt2pinyin(sentence)
247
+ transcript_norm = " ".join(phones)
248
+ else:
249
+ transcript = self.replace_numbers_with_words(sentence, lang=lang).split(' ')
250
+ transcript_norm = sentence
251
+ return (lang, transcript_norm)
lemas_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from datetime import datetime
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import tomli
12
+ from cached_path import cached_path
13
+ from hydra.utils import get_class
14
+ from omegaconf import OmegaConf
15
+
16
+ from lemas_tts.infer.utils_infer import (
17
+ mel_spec_type,
18
+ target_rms,
19
+ cross_fade_duration,
20
+ nfe_step,
21
+ cfg_strength,
22
+ sway_sampling_coef,
23
+ speed,
24
+ fix_duration,
25
+ device,
26
+ infer_process,
27
+ load_model,
28
+ load_vocoder,
29
+ preprocess_ref_audio_text,
30
+ remove_silence_for_generated_wav,
31
+ )
32
+
33
+ THIS_FILE = Path(__file__).resolve()
34
+
35
+
36
+ def _find_repo_root(start: Path) -> Path:
37
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
38
+ for p in [start, *start.parents]:
39
+ if (p / "pretrained_models").is_dir():
40
+ return p
41
+ cwd = Path.cwd()
42
+ if (cwd / "pretrained_models").is_dir():
43
+ return cwd
44
+ return start
45
+
46
+
47
+ REPO_ROOT = _find_repo_root(THIS_FILE)
48
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
49
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
50
+
51
+
52
+ parser = argparse.ArgumentParser(
53
+ prog="python3 infer-cli.py",
54
+ description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
55
+ epilog="Specify options above to override one or more settings from config.",
56
+ )
57
+ parser.add_argument(
58
+ "-c",
59
+ "--config",
60
+ type=str,
61
+ default=os.path.join(files("lemas_tts").joinpath("infer/examples/basic"), "basic.toml"),
62
+ help="The configuration file, default see infer/examples/basic/basic.toml",
63
+ )
64
+
65
+
66
+ # Note. Not to provide default value here in order to read default from config file
67
+
68
+ parser.add_argument(
69
+ "-m",
70
+ "--model",
71
+ type=str,
72
+ help="The model name: F5TTS_v1_Base | F5TTS_Base | E2TTS_Base | etc.",
73
+ )
74
+ parser.add_argument(
75
+ "-mc",
76
+ "--model_cfg",
77
+ type=str,
78
+ help="The path to F5-TTS model config file .yaml",
79
+ )
80
+ parser.add_argument(
81
+ "-p",
82
+ "--ckpt_file",
83
+ type=str,
84
+ help="The path to model checkpoint .pt, leave blank to use default",
85
+ )
86
+ parser.add_argument(
87
+ "-v",
88
+ "--vocab_file",
89
+ type=str,
90
+ help="The path to vocab file .txt, leave blank to use default",
91
+ )
92
+ parser.add_argument(
93
+ "-r",
94
+ "--ref_audio",
95
+ type=str,
96
+ help="The reference audio file.",
97
+ )
98
+ parser.add_argument(
99
+ "-s",
100
+ "--ref_text",
101
+ type=str,
102
+ help="The transcript/subtitle for the reference audio",
103
+ )
104
+ parser.add_argument(
105
+ "-t",
106
+ "--gen_text",
107
+ type=str,
108
+ help="The text to make model synthesize a speech",
109
+ )
110
+ parser.add_argument(
111
+ "-f",
112
+ "--gen_file",
113
+ type=str,
114
+ help="The file with text to generate, will ignore --gen_text",
115
+ )
116
+ parser.add_argument(
117
+ "-o",
118
+ "--output_dir",
119
+ type=str,
120
+ help="The path to output folder",
121
+ )
122
+ parser.add_argument(
123
+ "-w",
124
+ "--output_file",
125
+ type=str,
126
+ help="The name of output file",
127
+ )
128
parser.add_argument(
    "--save_chunk",
    action="store_true",
    # Fixed grammar in user-facing help text ("each audio chunks" -> "each audio chunk").
    help="To save each audio chunk during inference",
)
parser.add_argument(
    "--remove_silence",
    action="store_true",
    # Fixed user-facing typo: "ouput" -> "output".
    help="To remove long silence found in output",
)
138
+ parser.add_argument(
139
+ "--load_vocoder_from_local",
140
+ action="store_true",
141
+ help="To load vocoder from local dir, default to ../checkpoints/vocos-mel-24khz",
142
+ )
143
+ parser.add_argument(
144
+ "--vocoder_name",
145
+ type=str,
146
+ choices=["vocos", "bigvgan"],
147
+ help=f"Used vocoder name: vocos | bigvgan, default {mel_spec_type}",
148
+ )
149
+ parser.add_argument(
150
+ "--target_rms",
151
+ type=float,
152
+ help=f"Target output speech loudness normalization value, default {target_rms}",
153
+ )
154
+ parser.add_argument(
155
+ "--cross_fade_duration",
156
+ type=float,
157
+ help=f"Duration of cross-fade between audio segments in seconds, default {cross_fade_duration}",
158
+ )
159
+ parser.add_argument(
160
+ "--nfe_step",
161
+ type=int,
162
+ help=f"The number of function evaluation (denoising steps), default {nfe_step}",
163
+ )
164
+ parser.add_argument(
165
+ "--cfg_strength",
166
+ type=float,
167
+ help=f"Classifier-free guidance strength, default {cfg_strength}",
168
+ )
169
+ parser.add_argument(
170
+ "--sway_sampling_coef",
171
+ type=float,
172
+ help=f"Sway Sampling coefficient, default {sway_sampling_coef}",
173
+ )
174
+ parser.add_argument(
175
+ "--speed",
176
+ type=float,
177
+ help=f"The speed of the generated audio, default {speed}",
178
+ )
179
+ parser.add_argument(
180
+ "--fix_duration",
181
+ type=float,
182
+ help=f"Fix the total duration (ref and gen audios) in seconds, default {fix_duration}",
183
+ )
184
+ parser.add_argument(
185
+ "--device",
186
+ type=str,
187
+ help="Specify the device to run on",
188
+ )
189
+ args = parser.parse_args()
190
+
191
+
192
+ # config file
193
+
194
+ config = tomli.load(open(args.config, "rb"))
195
+
196
+
197
+ # command-line interface parameters
198
+
199
+ model = args.model or config.get("model", "F5TTS_v1_Base")
200
+ ckpt_file = args.ckpt_file or config.get("ckpt_file", "")
201
+ vocab_file = args.vocab_file or config.get("vocab_file", "")
202
+
203
+ ref_audio = args.ref_audio or config.get("ref_audio", "infer/examples/basic/basic_ref_en.wav")
204
+ ref_text = (
205
+ args.ref_text
206
+ if args.ref_text is not None
207
+ else config.get("ref_text", "Some call me nature, others call me mother nature.")
208
+ )
209
+ gen_text = args.gen_text or config.get("gen_text", "Here we generate something just for test.")
210
+ gen_file = args.gen_file or config.get("gen_file", "")
211
+
212
+ output_dir = args.output_dir or config.get("output_dir", "tests")
213
+ output_file = args.output_file or config.get(
214
+ "output_file", f"infer_cli_{datetime.now().strftime(r'%Y%m%d_%H%M%S')}.wav"
215
+ )
216
+
217
+ save_chunk = args.save_chunk or config.get("save_chunk", False)
218
+ remove_silence = args.remove_silence or config.get("remove_silence", False)
219
+ load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
220
+
221
+ vocoder_name = args.vocoder_name or config.get("vocoder_name", mel_spec_type)
222
+ target_rms = args.target_rms or config.get("target_rms", target_rms)
223
+ cross_fade_duration = args.cross_fade_duration or config.get("cross_fade_duration", cross_fade_duration)
224
+ nfe_step = args.nfe_step or config.get("nfe_step", nfe_step)
225
+ cfg_strength = args.cfg_strength or config.get("cfg_strength", cfg_strength)
226
+ sway_sampling_coef = args.sway_sampling_coef or config.get("sway_sampling_coef", sway_sampling_coef)
227
+ speed = args.speed or config.get("speed", speed)
228
+ fix_duration = args.fix_duration or config.get("fix_duration", fix_duration)
229
+ device = args.device or config.get("device", device)
230
+
231
+
232
+ # patches for pip pkg user
233
+ if "infer/examples/" in ref_audio:
234
+ ref_audio = str(files("lemas_tts").joinpath(f"{ref_audio}"))
235
+ if "infer/examples/" in gen_file:
236
+ gen_file = str(files("lemas_tts").joinpath(f"{gen_file}"))
237
+ if "voices" in config:
238
+ for voice in config["voices"]:
239
+ voice_ref_audio = config["voices"][voice]["ref_audio"]
240
+ if "infer/examples/" in voice_ref_audio:
241
+ config["voices"][voice]["ref_audio"] = str(files("lemas_tts").joinpath(f"{voice_ref_audio}"))
242
+
243
+
244
+ # ignore gen_text if gen_file provided
245
+
246
+ if gen_file:
247
+ gen_text = codecs.open(gen_file, "r", "utf-8").read()
248
+
249
+
250
+ # output path
251
+
252
+ wave_path = Path(output_dir) / output_file
253
+ # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
254
+ if save_chunk:
255
+ output_chunk_dir = os.path.join(output_dir, f"{Path(output_file).stem}_chunks")
256
+ if not os.path.exists(output_chunk_dir):
257
+ os.makedirs(output_chunk_dir)
258
+
259
+
260
+ # load vocoder
261
+
262
+ if vocoder_name == "vocos":
263
+ vocoder_local_path = str(CKPTS_ROOT / "vocos-mel-24khz")
264
+ elif vocoder_name == "bigvgan":
265
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
266
+
267
+ vocoder = load_vocoder(
268
+ vocoder_name=vocoder_name, is_local=load_vocoder_from_local, local_path=vocoder_local_path, device=device
269
+ )
270
+
271
+
272
+ # load TTS model
273
+
274
+ model_cfg = OmegaConf.load(
275
+ args.model_cfg or config.get("model_cfg", str(files("lemas_tts").joinpath(f"configs/{model}.yaml")))
276
+ )
277
+ model_cls = get_class(f"lemas_tts.model.{model_cfg.model.backbone}")
278
+ model_arc = model_cfg.model.arch
279
+
280
+ repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
281
+
282
+ if model != "F5TTS_Base":
283
+ assert vocoder_name == model_cfg.model.mel_spec.mel_spec_type
284
+
285
+ # override for previous models
286
+ if model == "F5TTS_Base":
287
+ if vocoder_name == "vocos":
288
+ ckpt_step = 1200000
289
+ elif vocoder_name == "bigvgan":
290
+ model = "F5TTS_Base_bigvgan"
291
+ ckpt_type = "pt"
292
+ elif model == "E2TTS_Base":
293
+ repo_name = "E2-TTS"
294
+ ckpt_step = 1200000
295
+
296
+ if not ckpt_file:
297
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}"))
298
+
299
+ print(f"Using {model}...")
300
+ ema_model = load_model(
301
+ model_cls, model_arc, ckpt_file, mel_spec_type=vocoder_name, vocab_file=vocab_file, device=device
302
+ )
303
+
304
+
305
+ # inference process
306
+
307
+
308
def main():
    """Run batch inference using the module-level config/CLI settings.

    Splits `gen_text` into voice-tagged chunks (`[voice] text ...`),
    synthesizes each chunk with the matching reference voice, concatenates
    the segments, and writes the result to `wave_path`.
    """
    main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
    if "voices" not in config:
        voices = {"main": main_voice}
    else:
        voices = config["voices"]
        voices["main"] = main_voice
    # Preprocess every reference voice once (transcription / trimming).
    for voice in voices:
        print("Voice:", voice)
        print("ref_audio ", voices[voice]["ref_audio"])
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
        print("ref_audio_", voices[voice]["ref_audio"], "\n\n")

    generated_audio_segments = []
    # Split *before* each "[voice]" tag (lookahead keeps the tag in the chunk).
    reg1 = r"(?=\[\w+\])"
    chunks = re.split(reg1, gen_text)
    reg2 = r"\[(\w+)\]"
    for text in chunks:
        if not text.strip():
            continue
        match = re.match(reg2, text)
        if match:
            voice = match[1]
        else:
            print("No voice tag found, using main.")
            voice = "main"
        if voice not in voices:
            print(f"Voice {voice} not found, using main.")
            voice = "main"
        # Strip the voice tag before synthesis.
        text = re.sub(reg2, "", text)
        ref_audio_ = voices[voice]["ref_audio"]
        ref_text_ = voices[voice]["ref_text"]
        gen_text_ = text.strip()
        print(f"Voice: {voice}")
        audio_segment, final_sample_rate, spectragram = infer_process(
            ref_audio_,
            ref_text_,
            gen_text_,
            ema_model,
            vocoder,
            mel_spec_type=vocoder_name,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            speed=speed,
            fix_duration=fix_duration,
            device=device,
        )
        generated_audio_segments.append(audio_segment)

        if save_chunk:
            # Truncate very long text so the chunk filename stays manageable.
            if len(gen_text_) > 200:
                gen_text_ = gen_text_[:200] + " ... "
            sf.write(
                os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
                audio_segment,
                final_sample_rate,
            )

    if generated_audio_segments:
        final_wave = np.concatenate(generated_audio_segments)

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        with open(wave_path, "wb") as f:
            sf.write(f.name, final_wave, final_sample_rate)
            # Remove silence
            if remove_silence:
                remove_silence_for_generated_wav(f.name)
            print(f.name)
383
+
384
+
385
+ if __name__ == "__main__":
386
+ main()
lemas_tts/infer/text_norm/__init__.py ADDED
File without changes
lemas_tts/infer/text_norm/cn_tn.py ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # coding=utf-8
3
+ # Authors:
4
+ # 2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
5
+ # 2019.9 Jiayu DU
6
+ #
7
+ # requirements:
8
+ # - python 3.X
9
+ # notes: python 2.X WILL fail or produce misleading results
10
+
11
+ import sys, os, argparse, codecs, string, re, unicodedata
12
+
13
+ # ================================================================================ #
14
+ # basic constant
15
+ # ================================================================================ #
16
+ CHINESE_DIGIS = u'零一二三四五六七八九'
17
+ BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
18
+ BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
19
+ SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
20
+ SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
21
+ LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
22
+ LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
23
+ SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
24
+ SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
25
+
26
+ ZERO_ALT = u'〇'
27
+ ONE_ALT = u'幺'
28
+ TWO_ALTS = [u'两', u'兩']
29
+
30
+ POSITIVE = [u'正', u'正']
31
+ NEGATIVE = [u'负', u'負']
32
+ POINT = [u'点', u'點']
33
+ # PLUS = [u'加', u'加']
34
+ # SIL = [u'杠', u'槓']
35
+
36
+ # 中文数字系统类型
37
+ NUMBERING_TYPES = ['low', 'mid', 'high']
38
+
39
+ CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
40
+ '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
41
+ CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
42
+ COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
43
+ '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
44
+ '针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|' \
45
+ '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
46
+ '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
47
+ '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
48
+
49
+ # punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git)
50
+ CHINESE_PUNC_STOP = '!?。。'
51
+ CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
52
+ CHINESE_PUNC_OTHER = '·〈〉-'
53
+ CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER
54
+
55
+ # ================================================================================ #
56
+ # basic class
57
+ # ================================================================================ #
58
class ChineseChar(object):
    """
    A Chinese character that exists in a simplified and a traditional form,
    e.g. simplified '负' vs. traditional '負'.  Conversion code reads whichever
    attribute it needs.
    """

    def __init__(self, simplified, traditional):
        self.simplified = simplified
        self.traditional = traditional

    def __str__(self):
        # Prefer the simplified form, fall back to traditional.
        return self.simplified or self.traditional or None

    def __repr__(self):
        # Delegate to __str__ so both printable forms agree.
        return self.__str__()
+
77
+
78
class ChineseNumberUnit(ChineseChar):
    """
    A Chinese number-unit character (十/百/千/万/亿/...).

    Besides the simplified/traditional pair, each unit also carries the
    "big" (formal/financial) writing, e.g. '陆' and '陸'.
    """

    def __init__(self, power, simplified, traditional, big_s, big_t):
        super(ChineseNumberUnit, self).__init__(simplified, traditional)
        # The unit's numeric value is 10 ** power.
        self.power = power
        self.big_s = big_s
        self.big_t = big_t

    def __str__(self):
        return '10^{}'.format(self.power)

    @classmethod
    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
        # `value` is a (simplified, traditional) pair; `index` is the position
        # in the unit table from which the power of ten is derived.
        if small_unit:
            # 十/百/千/万 -> 10^1 .. 10^4.
            return ChineseNumberUnit(power=index + 1,
                                     simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
        elif numbering_type == NUMBERING_TYPES[0]:
            # 'low' system: each larger unit is 10x the previous one.
            return ChineseNumberUnit(power=index + 8,
                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
        elif numbering_type == NUMBERING_TYPES[1]:
            # 'mid' system (default): each larger unit is 10000x the previous one.
            return ChineseNumberUnit(power=(index + 2) * 4,
                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
        elif numbering_type == NUMBERING_TYPES[2]:
            # 'high' system: each larger unit is the square of the previous one.
            return ChineseNumberUnit(power=pow(2, index + 3),
                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
        else:
            raise ValueError(
                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
112
+
113
+
114
class ChineseNumberDigit(ChineseChar):
    """
    A Chinese digit character (0-9), including the formal "big" writings
    and optional alternative forms (〇/幺/两).
    """

    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
        super(ChineseNumberDigit, self).__init__(simplified, traditional)
        self.value = value
        self.big_s = big_s
        self.big_t = big_t
        self.alt_s = alt_s
        self.alt_t = alt_t

    def __str__(self):
        # Digits print as their numeric value.
        return str(self.value)

    @classmethod
    def create(cls, i, v):
        # v = (simplified, traditional, big simplified, big traditional).
        return cls(i, *v)
133
+
134
+
135
class ChineseMath(ChineseChar):
    """
    A Chinese math symbol (sign or decimal point) with its ASCII equivalent.
    """

    def __init__(self, simplified, traditional, symbol, expression=None):
        super(ChineseMath, self).__init__(simplified, traditional)
        self.symbol = symbol          # ASCII symbol, e.g. '+', '-', '.'
        self.expression = expression  # optional callable implementing the op
        # Math symbols have no dedicated "big" writing; reuse the plain forms.
        self.big_s = simplified
        self.big_t = traditional
146
+
147
+
148
+ CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
149
+
150
+
151
class NumberSystem(object):
    """
    Chinese number system: a plain namespace that create_system() populates
    with `units`, `digits` and `math` attributes.
    """
    pass
156
+
157
+
158
class MathSymbol(object):
    """
    Math symbols used by the Chinese number system (simplified/traditional), e.g.
    positive = ['正', '正']
    negative = ['负', '負']
    point = ['点', '點']
    """

    def __init__(self, positive, negative, point):
        self.positive = positive
        self.negative = negative
        self.point = point

    def __iter__(self):
        # Yield the three symbols in declaration order.
        yield from self.__dict__.values()
174
+
175
+
176
+ # class OtherSymbol(object):
177
+ # """
178
+ # 其他符号
179
+ # """
180
+ #
181
+ # def __init__(self, sil):
182
+ # self.sil = sil
183
+ #
184
+ # def __iter__(self):
185
+ # for v in self.__dict__.values():
186
+ # yield v
187
+
188
+
189
+ # ================================================================================ #
190
+ # basic utils
191
+ # ================================================================================ #
192
def create_system(numbering_type=NUMBERING_TYPES[1]):
    """
    Build the Chinese number system for the given numbering type (default 'mid').

    NUMBERING_TYPES = ['low', 'mid', 'high']:
        low:  '兆' = '亿' * '十'  = 10^9,  '京' = '兆' * '十', etc.
        mid:  '兆' = '亿' * '万'  = 10^12, '京' = '兆' * '万', etc.
        high: '兆' = '亿' * '亿'  = 10^16, '京' = '兆' * '兆', etc.

    Returns a NumberSystem with `units`, `digits` and `math` populated.
    """

    # Chinese number units of '亿' and larger.
    all_larger_units = zip(
        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    larger_units = [CNU.create(i, v, numbering_type, False)
                    for i, v in enumerate(all_larger_units)]
    # Chinese number units of '十, 百, 千, 万'.
    all_smaller_units = zip(
        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
    smaller_units = [CNU.create(i, v, small_unit=True)
                     for i, v in enumerate(all_smaller_units)]
    # Digits 0-9 with their formal ("big") writings.
    chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
                        BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
    digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
    # Alternative writings: 〇 for zero, 幺 for one, 两/兩 for two.
    digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
    digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
    digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]

    # Sign and decimal-point symbols, with callables giving their arithmetic meaning.
    positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
    point_cn = CM(POINT[0], POINT[1], '.', lambda x,
                  y: float(str(x) + '.' + str(y)))
    # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
    system = NumberSystem()
    system.units = smaller_units + larger_units
    system.digits = digits
    system.math = MathSymbol(positive_cn, negative_cn, point_cn)
    # system.symbols = OtherSymbol(sil_cn)
    return system
232
+
233
+
234
def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
    """Convert a Chinese number string to an arabic number string.

    e.g. '两千万' -> '20000000'; a decimal part after 点/點 is supported.
    """

    def get_symbol(char, system):
        # Map a single character to its unit / digit / math symbol object.
        # Returns None implicitly for unknown characters.
        for u in system.units:
            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
                return u
        for d in system.digits:
            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
                return d
        for m in system.math:
            if char in [m.traditional, m.simplified]:
                return m

    def string2symbols(chinese_string, system):
        # Split at the decimal point (if any) and map every char to a symbol.
        int_string, dec_string = chinese_string, ''
        for p in [system.math.point.simplified, system.math.point.traditional]:
            if p in chinese_string:
                int_string, dec_string = chinese_string.split(p)
                break
        return [get_symbol(c, system) for c in int_string], \
            [get_symbol(c, system) for c in dec_string]

    def correct_symbols(integer_symbols, system):
        """
        Normalize elided forms, e.g.
        一百八 to 一百八十
        一亿一千三百万 to 一亿 一千万 三百万
        """

        # A leading '十' implies '一十'.
        if integer_symbols and isinstance(integer_symbols[0], CNU):
            if integer_symbols[0].power == 1:
                integer_symbols = [system.digits[1]] + integer_symbols

        # A trailing digit inherits the next-lower unit: 一百八 -> 一百八[十].
        if len(integer_symbols) > 1:
            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
                integer_symbols.append(
                    CNU(integer_symbols[-2].power - 1, None, None, None, None))

        result = []
        unit_count = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                result.append(s)
                unit_count = 0
            elif isinstance(s, CNU):
                current_unit = CNU(s.power, None, None, None, None)
                unit_count += 1

                if unit_count == 1:
                    result.append(current_unit)
                elif unit_count > 1:
                    # Consecutive units multiply: fold this unit's power into
                    # every earlier, smaller unit (e.g. 千 + 万 -> 千万).
                    for i in range(len(result)):
                        if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
                            result[-i - 1] = CNU(result[-i - 1].power +
                                                 current_unit.power, None, None, None, None)
        return result

    def compute_value(integer_symbols):
        """
        Compute the value.
        When current unit is larger than previous unit, current unit * all previous units will be used as all previous units.
        e.g. '两千万' = 2000 * 10000 not 2000 + 10000
        """
        value = [0]
        last_power = 0
        for s in integer_symbols:
            if isinstance(s, CND):
                value[-1] = s.value
            elif isinstance(s, CNU):
                value[-1] *= pow(10, s.power)
                if s.power > last_power:
                    # Bigger unit than anything seen so far: it scales all
                    # previously accumulated partial values too.
                    value[:-1] = list(map(lambda v: v *
                                          pow(10, s.power), value[:-1]))
                last_power = s.power
                value.append(0)
        return sum(value)

    system = create_system(numbering_type)
    int_part, dec_part = string2symbols(chinese_string, system)
    int_part = correct_symbols(int_part, system)
    int_str = str(compute_value(int_part))
    # Decimal digits are read positionally, so just concatenate their values.
    dec_str = ''.join([str(d.value) for d in dec_part])
    if dec_part:
        return '{0}.{1}'.format(int_str, dec_str)
    else:
        return int_str
319
+
320
+
321
def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
            traditional=False, alt_zero=False, alt_one=False, alt_two=True,
            use_zeros=True, use_units=True):
    """Convert an arabic number string to its Chinese reading.

    Args:
        number_string: digit string, optionally with one decimal dot.
        numbering_type: unit system, one of NUMBERING_TYPES.
        big: use formal/financial digit forms (壹贰...); disables 两.
        traditional: emit traditional characters.
        alt_zero / alt_one: use the alternatives 〇 / 幺 instead of 零 / 一.
        alt_two: use 两 where idiomatic (e.g. 两千 instead of 二千).
        use_zeros: insert 零 for skipped places.
        use_units: read with place-value units (十/百/...); if False, read
            the integer part digit-by-digit.
    """

    def get_value(value_string, use_zeros=True):
        # Recursively convert an integer digit string into symbol objects.

        striped_string = value_string.lstrip('0')

        # record nothing if all zeros
        if not striped_string:
            return []

        # record one digits
        elif len(striped_string) == 1:
            # A leading zero was stripped -> voice it as 零 before the digit.
            if use_zeros and len(value_string) != len(striped_string):
                return [system.digits[0], system.digits[int(striped_string)]]
            else:
                return [system.digits[int(striped_string)]]

        # recursively record multiple digits
        else:
            # Largest unit strictly smaller than the number's magnitude.
            result_unit = next(u for u in reversed(
                system.units) if u.power < len(striped_string))
            result_string = value_string[:-result_unit.power]
            return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])

    system = create_system(numbering_type)

    int_dec = number_string.split('.')
    if len(int_dec) == 1:
        int_string = int_dec[0]
        dec_string = ""
    elif len(int_dec) == 2:
        int_string = int_dec[0]
        dec_string = int_dec[1]
    else:
        raise ValueError(
            "invalid input num string with more than one dot: {}".format(number_string))

    if use_units and len(int_string) > 1:
        result_symbols = get_value(int_string)
    else:
        result_symbols = [system.digits[int(c)] for c in int_string]
    dec_symbols = [system.digits[int(c)] for c in dec_string]
    if dec_string:
        result_symbols += [system.math.point] + dec_symbols

    if alt_two:
        # Replace 二 with 两 when it precedes a unit other than 十
        # (e.g. 两百, 两千, but 二十).
        liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
                    system.digits[2].big_s, system.digits[2].big_t)
        for i, v in enumerate(result_symbols):
            if isinstance(v, CND) and v.value == 2:
                next_symbol = result_symbols[i + 1] if i < len(result_symbols) - 1 else None
                previous_symbol = result_symbols[i - 1] if i > 0 else None
                if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
                    if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
                        result_symbols[i] = liang

    # if big is True, '两' will not be used and `alt_two` has no impact on output
    if big:
        attr_name = 'big_'
        if traditional:
            attr_name += 't'
        else:
            attr_name += 's'
    else:
        if traditional:
            attr_name = 'traditional'
        else:
            attr_name = 'simplified'

    result = ''.join([getattr(s, attr_name) for s in result_symbols])

    # if not use_zeros:
    #     result = result.strip(getattr(system.digits[0], attr_name))

    if alt_zero:
        result = result.replace(
            getattr(system.digits[0], attr_name), system.digits[0].alt_s)

    if alt_one:
        result = result.replace(
            getattr(system.digits[1], attr_name), system.digits[1].alt_s)

    # A bare fraction like '.5' reads as 零点五.
    for i, p in enumerate(POINT):
        if result.startswith(p):
            return CHINESE_DIGIS[0] + result

    # ^10, 11, .., 19: drop the leading 一 of 一十X.
    if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
                                          SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
            result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
        result = result[1:]

    return result
417
+
418
+
419
+ # ================================================================================ #
420
+ # different types of rewriters
421
+ # ================================================================================ #
422
class Cardinal:
    """
    CARDINAL: conversion between an arabic number string and Chinese text.
    """

    def __init__(self, cardinal=None, chntext=None):
        self.cardinal = cardinal  # arabic-digit string, e.g. '123'
        self.chntext = chntext    # Chinese reading, e.g. '一百二十三'

    def chntext2cardinal(self):
        # Chinese reading -> arabic number string.
        return chn2num(self.chntext)

    def cardinal2chntext(self):
        # Arabic number string -> Chinese reading.
        return num2chn(self.cardinal)
436
+
437
class Digit:
    """
    DIGIT: read a number digit-by-digit (no place-value units),
    e.g. '09' -> '零九'.  Used for years and ID-like numbers.
    """

    def __init__(self, digit=None, chntext=None):
        self.digit = digit
        self.chntext = chntext

    # def chntext2digit(self):
    #     return chn2num(self.chntext)

    def digit2chntext(self):
        # alt_two=False: read 2 as 二 (never 两); use_units=False: no 十/百/...
        return num2chn(self.digit, alt_two=False, use_units=False)
451
+
452
+
453
class TelePhone:
    """
    TELEPHONE: read a phone number digit-by-digit.
    """

    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
        self.telephone = telephone
        self.raw_chntext = raw_chntext
        self.chntext = chntext

    # def chntext2telephone(self):
    #     sil_parts = self.raw_chntext.split('<SIL>')
    #     self.telephone = '-'.join([
    #         str(chn2num(p)) for p in sil_parts
    #     ])
    #     return self.telephone

    def telephone2chntext(self, fixed=False):
        """Read each digit group; `fixed` means a landline ('-' separated)."""
        if fixed:
            parts, marker = self.telephone.split('-'), '<SIL>'
        else:
            parts, marker = self.telephone.strip('+').split(), '<SP>'
        self.raw_chntext = marker.join(
            num2chn(part, alt_two=False, use_units=False) for part in parts)
        # raw_chntext keeps the pause markers; chntext is the plain reading.
        self.chntext = self.raw_chntext.replace(marker, '')
        return self.chntext
485
+
486
+
487
class Fraction:
    """
    FRACTION: 'a/b' <-> 'b分之a' (the denominator is read first in Chinese).
    """

    def __init__(self, fraction=None, chntext=None):
        self.fraction = fraction
        self.chntext = chntext

    def chntext2fraction(self):
        denominator, numerator = self.chntext.split('分之')
        return '{}/{}'.format(chn2num(numerator), chn2num(denominator))

    def fraction2chntext(self):
        numerator, denominator = self.fraction.split('/')
        return '{}分之{}'.format(num2chn(denominator), num2chn(numerator))
503
+
504
+
505
class Date:
    """
    DATE: a date string like '1999年2月20日' -> Chinese reading.

    The year is read digit-by-digit ('一九九九年'), month and day as cardinals.
    """

    def __init__(self, date=None, chntext=None):
        self.date = date
        self.chntext = chntext

    # def chntext2date(self):
    #     chntext = self.chntext
    #     try:
    #         year, other = chntext.strip().split('年', maxsplit=1)
    #         year = Digit(chntext=year).digit2chntext() + '年'
    #     except ValueError:
    #         other = chntext
    #         year = ''
    #     if other:
    #         try:
    #             month, day = other.strip().split('月', maxsplit=1)
    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
    #         except ValueError:
    #             day = chntext
    #             month = ''
    #         if day:
    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
    #     else:
    #         month = ''
    #         day = ''
    #     date = year + month + day
    #     self.date = date
    #     return self.date

    def date2chntext(self):
        date = self.date
        try:
            year, other = date.strip().split('年', 1)
            year = Digit(digit=year).digit2chntext() + '年'
        except ValueError:
            # No '年' marker: the whole string is month/day.
            other = date
            year = ''
        if other:
            try:
                month, day = other.strip().split('月', 1)
                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
            except ValueError:
                # Fixed: the original fell back to the full `date` string here,
                # re-including any already-consumed year part for inputs like
                # '1999年20日'; only the remainder after the year belongs here.
                day = other
                month = ''
            if day:
                # `day` ends with its literal marker (日/号); convert digits only.
                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
        else:
            month = ''
            day = ''
        chntext = year + month + day
        self.chntext = chntext
        return self.chntext
561
+
562
class Time:
    """
    TIME: a clock time like '12:30' -> Chinese reading (12时30分).
    """

    def __init__(self, time=None, chntext=None):
        self.time = time
        self.chntext = chntext

    # def chntext2money(self):
    #     return self.money

    def time2chntext(self):
        text = self.time.replace('-', '至')
        # NOTE(review): findall returns 3-element group tuples, so the
        # len(...) > 2 guard is always true whenever any match exists.
        found = re.compile(r'(\d{1,2}:\d{1,2}(:)?(\d{1,2})?)').findall(text)
        if found and len(found[0]) > 2:
            text = text.replace(':', '时', 1).replace(':', '分', 1)
        self.chntext = text
        return self.chntext
584
+
585
class Money:
    """
    MONEY: read the numeric parts of an amount string in Chinese,
    leaving currency words (元/块/...) in place.
    """

    def __init__(self, money=None, chntext=None):
        self.money = money
        self.chntext = chntext

    # def chntext2money(self):
    #     return self.money

    def money2chntext(self):
        text = self.money
        # Convert every integer/decimal number found in the string.
        for groups in re.compile(r'(\d+(\.\d+)?)').findall(text):
            number = groups[0]
            text = text.replace(number, Cardinal(cardinal=number).cardinal2chntext())
        self.chntext = text
        return self.chntext
606
+
607
+
608
class Percentage:
    """
    PERCENTAGE: '80.03%' <-> '百分之八十点零三'.
    """

    def __init__(self, percentage=None, chntext=None):
        self.percentage = percentage
        self.chntext = chntext

    def chntext2percentage(self):
        # NOTE(review): str.strip('百分之') strips a *character set*, so digit
        # readings are safe but trailing 百/分/之 characters would be
        # over-stripped — confirm inputs always end right after the number.
        return chn2num(self.chntext.strip().strip('百分之')) + '%'

    def percentage2chntext(self):
        return '百分之' + num2chn(self.percentage.strip().strip('%'))
622
+
623
+
624
+ # ================================================================================ #
625
+ # NSW Normalizer
626
+ # ================================================================================ #
627
class NSWNormalizer:
    """
    Non-Standard-Word normalizer for Chinese text: rewrites dates, times,
    money, phone numbers, fractions, percentages and plain numbers into
    their spoken Chinese form.

    Fixed: __init__ previously accepted no text although every call site in
    this file used ``NSWNormalizer(text).normalize()`` (a TypeError).  The
    constructor now takes an optional ``raw_text`` and ``normalize()`` falls
    back to it, so both calling styles work.
    """

    def __init__(self, raw_text=' '):
        self.raw_text = raw_text  # '^' + raw_text + '$' once normalize() runs
        self.norm_text = ''

    def _particular(self):
        # Restore patterns like 'O2O'/'B2C' where the digit 2 between ASCII
        # letters was wrongly converted to 二.
        text = self.norm_text
        pattern = re.compile(r"(([a-zA-Z]+)二([a-zA-Z]+))")
        matchers = pattern.findall(text)
        if matchers:
            # print('particular')
            for matcher in matchers:
                text = text.replace(matcher[0], matcher[1] + '2' + matcher[2], 1)
        self.norm_text = text
        return self.norm_text

    def normalize(self, raw_text=None):
        """Normalize *raw_text* (or the text given to the constructor)."""
        if raw_text is None:
            raw_text = self.raw_text
        # Sentinels guarantee a non-digit char on both sides for the regexes.
        self.raw_text = '^' + raw_text + '$'
        text = unicodedata.normalize("NFKC", self.raw_text)
        # Normalize dates.
        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
        matchers = pattern.findall(text)
        if matchers:
            # print('date')
            for matcher in matchers:
                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)

        # Normalize times.
        pattern = re.compile(r"\D+((\d{1,2}-)?\d{1,2}[时点:]((\d{1,2}-)?\d{1,2}[分:]((\d{1,2}-)?\d{1,2}秒)?)?)")
        matchers = pattern.findall(text)
        if matchers:
            # print('time')
            for matcher in matchers:
                text = text.replace(matcher[0], Time(time=matcher[0]).time2chntext(), 1)

        # Normalize money amounts.
        pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
        matchers = pattern.findall(text)
        if matchers:
            # print('money')
            for matcher in matchers:
                text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)

        # Normalize landline / mobile phone numbers.
        # Mobile prefixes (see http://www.jihaoba.com/news/show/13680):
        # China Mobile:  139 138 137 136 135 134 159 158 157 150 151 152 188 187 182 183 184 178 198
        # China Unicom:  130 131 132 156 155 186 185 176
        # China Telecom: 133 153 189 180 181 177
        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
        matchers = pattern.findall(text)
        if matchers:
            # print('telephone')
            for matcher in matchers:
                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
        # Landlines: optional area code, then 7-8 digits.
        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
        matchers = pattern.findall(text)
        if matchers:
            # print('fixed telephone')
            for matcher in matchers:
                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)

        # Normalize fractions.
        pattern = re.compile(r"(\d+/\d+)")
        matchers = pattern.findall(text)
        if matchers:
            # print('fraction')
            for matcher in matchers:
                text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)

        # Normalize percentages (fold fullwidth % to ASCII first).
        text = text.replace('%', '%')
        pattern = re.compile(r"(\d+(\.\d+)?%)")
        matchers = pattern.findall(text)
        if matchers:
            # print('percentage')
            for matcher in matchers:
                text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)

        # Normalize number + quantifier (e.g. 3个, 12.5万吨).
        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
        matchers = pattern.findall(text)
        if matchers:
            # print('cardinal+quantifier')
            for matcher in matchers:
                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)

        # Normalize long digit sequences (IDs) digit-by-digit.
        pattern = re.compile(r"(\d{2,32})")
        matchers = pattern.findall(text)
        if matchers:
            # print('digit')
            for matcher in matchers:
                text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)

        # Normalize any remaining plain numbers.
        pattern = re.compile(r"(\d+(\.\d+)?)")
        matchers = pattern.findall(text)
        if matchers:
            # print('cardinal')
            for matcher in matchers:
                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)

        self.norm_text = text
        self._particular()

        return self.norm_text.lstrip('^').rstrip('$')
735
+
736
+
737
def nsw_test_case(raw_text):
    """Print one normalization example: input line, then normalized output."""
    print('I:' + raw_text)
    # Fixed: NSWNormalizer.__init__ takes no text argument in this module;
    # the text must go to normalize() (the original passed it to the
    # constructor, which raised TypeError).
    print('O:' + NSWNormalizer().normalize(raw_text))
    print('')
741
+
742
+
743
def nsw_test():
    """Run the built-in normalization smoke cases."""
    cases = [
        '固话:0595-23865596或23880880。',
        '固话:0595-23865596或23880880。',
        '手机:+86 19859213959或15659451527。',
        '分数:32477/76391。',
        '百分数:80.03%。',
        '编号:31520181154418。',
        '纯数:2983.07克或12345.60米。',
        '日期:1999年2月20日或09年3月15号。',
        '金钱:12块5,34.5元,20.1万',
        '特殊:O2O或B2C。',
        '3456万吨',
        '2938个',
        '938',
        '今天吃了115个小笼包231个馒头',
        '有62%的概率',
    ]
    for case in cases:
        nsw_test_case(case)
759
+
760
+
761
if __name__ == '__main__':
    # nsw_test()

    # Batch text-normalization CLI: read a UTF-8 text file (optionally in
    # Kaldi "key text..." format) and write the normalized, punctuation-free
    # output line by line.
    p = argparse.ArgumentParser()
    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
    p.add_argument('ofile', help='output filename')
    p.add_argument('--to_upper', action='store_true', help='convert to upper case')
    p.add_argument('--to_lower', action='store_true', help='convert to lower case')
    p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
    p.add_argument('--log_interval', type=int, default=100000, help='log interval in number of processed lines')
    args = p.parse_args()

    ifile = codecs.open(args.ifile, 'r', 'utf8')
    ofile = codecs.open(args.ofile, 'w+', 'utf8')

    n = 0
    for l in ifile:
        key = ''
        text = ''
        if args.has_key:
            # First whitespace-separated field is the Kaldi utterance key.
            cols = l.split(maxsplit=1)
            key = cols[0]
            if len(cols) == 2:
                text = cols[1].strip()
            else:
                text = ''
        else:
            text = l.strip()

        # Case conversion (mutually exclusive flags).
        if args.to_upper and args.to_lower:
            sys.stderr.write('cn_tn.py: to_upper OR to_lower?')
            exit(1)
        if args.to_upper:
            text = text.upper()
        if args.to_lower:
            text = text.lower()

        # NSW(Non-Standard-Word) normalization.
        # Fixed: NSWNormalizer.__init__ takes no text argument; pass the text
        # to normalize() (the original constructor call raised TypeError).
        text = NSWNormalizer().normalize(text)

        # Punctuation removal: map every CN and EN punctuation char to a space.
        old_chars = CHINESE_PUNC_LIST + string.punctuation  # includes all CN and EN punctuations
        new_chars = ' ' * len(old_chars)
        del_chars = ''
        text = text.translate(str.maketrans(old_chars, new_chars, del_chars))

        # Emit, keeping the key column when present.
        if args.has_key:
            ofile.write(key + '\t' + text + '\n')
        else:
            if text.strip() != '':  # skip empty line in pure text format(without Kaldi's utt key)
                ofile.write(text + '\n')

        n += 1
        if n % args.log_interval == 0:
            sys.stderr.write("cn_tn.py: {} lines done.\n".format(n))
            sys.stderr.flush()

    sys.stderr.write("cn_tn.py: {} lines done in total.\n".format(n))
    sys.stderr.flush()

    ifile.close()
    ofile.close()
+ ofile.close()
lemas_tts/infer/text_norm/en_tn.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2017 Keith Ito
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
5
+ # of this software and associated documentation files (the "Software"), to deal
6
+ # in the Software without restriction, including without limitation the rights
7
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8
+ # copies of the Software, and to permit persons to whom the Software is
9
+ # furnished to do so, subject to the following conditions:
10
+
11
+ # The above copyright notice and this permission notice shall be included in
12
+ # all copies or substantial portions of the Software.
13
+
14
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20
+ # THE SOFTWARE.
21
+
22
+ import re
23
+ from unidecode import unidecode
24
+ import inflect
25
+
26
+ _inflect = inflect.engine()
27
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
28
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
29
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
30
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
31
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
32
+ _number_re = re.compile(r"[0-9]+")
33
+
34
+
35
+ def _remove_commas(m):
36
+ return m.group(1).replace(",", "")
37
+
38
+
39
+ def _expand_decimal_point(m):
40
+ return m.group(1).replace(".", " point ")
41
+
42
+
43
+ def _expand_dollars(m):
44
+ match = m.group(1)
45
+ parts = match.split(".")
46
+ if len(parts) > 2:
47
+ return match + " dollars" # Unexpected format
48
+ dollars = int(parts[0]) if parts[0] else 0
49
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
50
+ if dollars and cents:
51
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
52
+ cent_unit = "cent" if cents == 1 else "cents"
53
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
54
+ elif dollars:
55
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
56
+ return "%s %s" % (dollars, dollar_unit)
57
+ elif cents:
58
+ cent_unit = "cent" if cents == 1 else "cents"
59
+ return "%s %s" % (cents, cent_unit)
60
+ else:
61
+ return "zero dollars"
62
+
63
+
64
def _expand_ordinal(m):
    # '1st' -> 'first', '22nd' -> 'twenty-second', via the inflect engine.
    return _inflect.number_to_words(m.group(0))
66
+
67
+
68
def _expand_number(m):
    """Spell out a matched integer, reading 1001-2999 in year style where natural."""
    num = int(m.group(0))
    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword="")
    if num == 2000:
        return "two thousand"
    if 2000 < num < 2010:
        return "two thousand " + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + " hundred"
    # Year-style digit pairs, e.g. 1999 -> 'nineteen ninety-nine'.
    return _inflect.number_to_words(
        num, andword="", zero="oh", group=2
    ).replace(", ", " ")
83
+
84
+
85
def normalize_numbers(text):
    """Expand numbers, currency and ordinals into words (substitution order matters)."""
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r"\1 pounds"),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
93
+
94
+ # Regular expression matching whitespace:
95
+ _whitespace_re = re.compile(r"\s+")
96
+
97
+ # List of (regular expression, replacement) pairs for abbreviations:
98
+ _abbreviations = [
99
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
100
+ for x in [
101
+ ("mrs", "misess"),
102
+ ("mr", "mister"),
103
+ ("dr", "doctor"),
104
+ ("st", "saint"),
105
+ ("co", "company"),
106
+ ("jr", "junior"),
107
+ ("maj", "major"),
108
+ ("gen", "general"),
109
+ ("drs", "doctors"),
110
+ ("rev", "reverend"),
111
+ ("lt", "lieutenant"),
112
+ ("hon", "honorable"),
113
+ ("sgt", "sergeant"),
114
+ ("capt", "captain"),
115
+ ("esq", "esquire"),
116
+ ("ltd", "limited"),
117
+ ("col", "colonel"),
118
+ ("ft", "fort"),
119
+ ]
120
+ ]
121
+
122
+
123
def expand_abbreviations(text):
    """Expand abbreviations like 'mr.' / 'dr.' using the _abbreviations table."""
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
127
+
128
+
129
def expand_numbers(text):
    """Alias for normalize_numbers, kept for cleaner-pipeline naming."""
    return normalize_numbers(text)
131
+
132
+
133
def lowercase(text):
    """Lowercase *text* (plain str.lower)."""
    return text.lower()
135
+
136
+
137
def collapse_whitespace(text):
    """Collapse every run of whitespace into a single space."""
    return _whitespace_re.sub(" ", text)
139
+
140
+
141
def convert_to_ascii(text):
    # Transliterate any Unicode text to plain ASCII via unidecode
    # (e.g. 'café' -> 'cafe').
    return unidecode(text)
143
+
144
+
145
def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    return collapse_whitespace(lowercase(text))
150
+
151
+
152
def transliteration_cleaners(text):
    """Pipeline for non-English text that transliterates to ASCII."""
    return collapse_whitespace(lowercase(convert_to_ascii(text)))
158
+
159
+
160
def english_cleaners(text):
    """Pipeline for English text, including number and abbreviation expansion."""
    for step in (convert_to_ascii, lowercase, expand_numbers,
                 expand_abbreviations, collapse_whitespace):
        text = step(text)
    return text
168
+
169
def read_lexicon(lex_path):
    """Load a pronunciation lexicon with one 'WORD PH1 PH2 ...' entry per line.

    Returns a dict mapping word -> phone list; only the first entry for a
    word is kept, matching the original behavior.

    Fixed: open with an explicit UTF-8 encoding instead of the platform
    default, and skip blank / whitespace-only lines (previously a blank line
    produced a bogus '' -> [] entry, and leading whitespace produced an
    empty-string word).
    """
    lexicon = {}
    with open(lex_path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # blank line
            word, phones = parts[0], parts[1:]
            if word not in lexicon:
                lexicon[word] = phones
    return lexicon
lemas_tts/infer/text_norm/gp2py.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import os
4
+ from typing import List
5
+
6
+ import jieba
7
+ import pypinyin
8
+
9
+ SPECIAL_NOTES = '。?!?!.;;:,,:'
10
+
11
+
12
def read_vocab(file: os.PathLike) -> List[str]:
    """Read a vocabulary file: one symbol per line, blank lines dropped.

    Args:
        file: path to a UTF-8 text file.

    Returns:
        List of non-empty symbol strings in file order.
    """
    # Explicit UTF-8: vocab files may contain Chinese characters, so do not
    # depend on the platform's locale encoding.
    with open(file, encoding='utf-8') as f:
        vocab = f.read().split('\n')
    vocab = [v for v in vocab if len(v) > 0 and v != '\n']
    return vocab
17
+
18
+
19
class TextNormal:
    """Chinese grapheme-to-pinyin front end with tone-sandhi fixes.

    Splits raw text into sentences on punctuation, converts each sentence
    to pinyin (jieba word segmentation + pypinyin), then applies
    third-tone sandhi, the bu4 -> bu2 tone change, neutral-tone padding,
    and optional erhua (儿化) merging.

    Fix vs. original: removed a leftover debug ``print`` in :meth:`gp2py`
    that polluted stdout on every call when ``fix_er`` was enabled.
    """

    def __init__(self,
                 gp_vocab_file: os.PathLike,
                 py_vocab_file: os.PathLike,
                 add_sp1=False,
                 fix_er=False,
                 add_sil=True):
        """
        Args:
            gp_vocab_file: grapheme (character) vocabulary file, or None.
            py_vocab_file: pinyin vocabulary file, or None.
            add_sp1: replace pause punctuation with the 'sp1' token.
            fix_er: merge 儿 into the preceding syllable when the merged
                syllable exists in the pinyin vocabulary.
            add_sil: wrap each sentence with 'sil' boundary tokens.
        """
        if gp_vocab_file is not None:
            self.gp_vocab = read_vocab(gp_vocab_file)
        if py_vocab_file is not None:
            self.py_vocab = read_vocab(py_vocab_file)
            # Fast membership table used to validate merged erhua syllables.
            self.in_py_vocab = dict([(p, True) for p in self.py_vocab])
        self.add_sp1 = add_sp1
        self.add_sil = add_sil
        self.fix_er = fix_er

    def _split2sent(self, text):
        """Split *text* into sentences on SPECIAL_NOTES punctuation.

        Returns:
            (sentences, tokens): the sentence fragments and the punctuation
            marks encountered in *text*, in order.
        """
        new_sub = [text]
        while True:
            sub = copy.deepcopy(new_sub)
            new_sub = []
            for s in sub:
                sp = False
                for t in SPECIAL_NOTES:
                    if t in s:
                        new_sub += s.split(t)
                        sp = True
                        break

                if not sp and len(s) > 0:
                    new_sub += [s]
            # Stop once a full pass produced no further splits.
            if len(new_sub) == len(sub):
                break
        tokens = [a for a in text if a in SPECIAL_NOTES]

        return new_sub, tokens

    def _correct_tone3(self, pys: List[str]) -> List[str]:
        """Apply third-tone sandhi: in runs of tone-3 syllables, earlier
        syllables are pronounced tone 2."""
        # Three consecutive tone-3 syllables: change the middle one first.
        for i in range(2, len(pys)):
            if pys[i][-1] == '3' and pys[i - 1][-1] == '3' and pys[i - 2][-1] == '3':
                pys[i - 1] = pys[i - 1][:-1] + '2'  # change the middle one
        # Remaining tone-3 pairs: the first of each pair becomes tone 2.
        for i in range(1, len(pys)):
            if pys[i][-1] == '3':
                if pys[i - 1][-1] == '3':
                    pys[i - 1] = pys[i - 1][:-1] + '2'
        return pys

    def _correct_tone4(self, pys: List[str]) -> List[str]:
        """不 sandhi: bu4 before a tone-4 syllable becomes bu2
        (bu2 yao4), but stays bu4 otherwise (bu4 neng2)."""
        for i in range(len(pys) - 1):
            if pys[i] == 'bu4':
                if pys[i + 1][-1] == '4':
                    pys[i] = 'bu2'
        return pys

    def _replace_with_sp(self, pys: List[str]) -> List[str]:
        """Turn pause punctuation (commas, 、) into the 'sp1' pause token."""
        for i, p in enumerate(pys):
            if p in ',,、':
                pys[i] = 'sp1'
        return pys

    def _correct_tone5(self, pys: List[str]) -> List[str]:
        """Append '5' (neutral tone) to any syllable lacking a tone digit."""
        for i in range(len(pys)):
            if pys[i][-1] not in '1234':
                pys[i] += '5'
        return pys

    def gp2py(self, gp_text: str) -> List[str]:
        """Convert a grapheme string to parallel pinyin / grapheme sentences.

        Returns:
            (py_sent_list, gp_sent_list): per-sentence pinyin strings and
            space-separated grapheme strings, optionally 'sil'-wrapped.
        """
        gp_sent_list, tokens = self._split2sent(gp_text)
        py_sent_list = []
        for sent in gp_sent_list:
            pys = []
            # Segment into words first so pypinyin can disambiguate
            # polyphonic characters from context.
            for words in list(jieba.cut(sent)):
                py = pypinyin.pinyin(words, pypinyin.TONE3)
                py = [p[0] for p in py]
                pys += py
            if self.add_sp1:
                pys = self._replace_with_sp(pys)
            pys = self._correct_tone3(pys)
            pys = self._correct_tone4(pys)
            pys = self._correct_tone5(pys)
            if self.add_sil:
                py_sent_list += [' '.join(['sil'] + pys + ['sil'])]
            else:
                py_sent_list += [' '.join(pys)]

        if self.add_sil:
            gp_sent_list = ['sil ' + ' '.join(list(gp)) + ' sil' for gp in gp_sent_list]
        else:
            gp_sent_list = [' '.join(list(gp)) for gp in gp_sent_list]

        if self.fix_er:
            new_py_sent_list = []
            for py, gp in zip(py_sent_list, gp_sent_list):
                py = self._convert_er2(py, gp)
                new_py_sent_list += [py]
            py_sent_list = new_py_sent_list

        return py_sent_list, gp_sent_list

    def _convert_er2(self, py, gp):
        """Merge 儿 (er2) into the preceding syllable as an 'r' coda (erhua),
        but only when the merged syllable exists in the pinyin vocabulary."""
        py2hz = dict([(p, h) for p, h in zip(py.split(), gp.split())])
        py_list = py.split()
        for i, p in enumerate(py_list):
            if (p == 'er2' and py2hz[p] == '儿' and i > 1 and len(py_list[i - 1]) > 2 and py_list[i - 1][-1] in '1234'):

                py_er = py_list[i - 1][:-1] + 'r' + py_list[i - 1][-1]

                if self.in_py_vocab.get(py_er, False):  # must be in vocab
                    py_list[i - 1] = py_er
                    py_list[i] = 'r'
        py = ' '.join(py_list)
        return py
138
+
139
+
140
if __name__ == '__main__':
    # CLI smoke test: convert -t/--text to pinyin using local vocab files.
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--text', type=str)
    tn = TextNormal('gp.vocab', 'py.vocab', add_sp1=True, fix_er=True)
    py_list, gp_list = tn.gp2py(parser.parse_args().text)
    for py, gp in zip(py_list, gp_list):
        print(py + '|' + gp)
lemas_tts/infer/text_norm/id_tn.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Indonesian TTS Text Normalization for YouTube subtitles
2
+ # Requirements: pip install num2words
3
+ import re
4
+ from num2words import num2words
5
+
6
+ # --- small slang map (expandable) ---
7
+ SLANG_MAP = {
8
+ "gpp": "nggak apa-apa",
9
+ "gak": "nggak", "ga": "nggak", "gk": "nggak",
10
+ "sy": "saya", "sya": "saya",
11
+ "km": "kamu",
12
+ "tp": "tapi", "tpi": "tapi",
13
+ "jd": "jadi",
14
+ "bgt": "banget",
15
+ "blm": "belum",
16
+ "trs": "terus",
17
+ "sm": "sama",
18
+ "wkwk": "wkwk", # keep as-is (laugh token) or strip later
19
+ "wkwkwk": "wkwk"
20
+ }
21
+
22
+ # emoji pattern: removes most emoji blocks
23
+ EMOJI_PATTERN = re.compile(
24
+ "["
25
+ "\U0001F600-\U0001F64F" # emoticons
26
+ "\U0001F300-\U0001F5FF" # symbols & pictographs
27
+ "\U0001F680-\U0001F6FF" # transport & map symbols
28
+ "\U0001F1E0-\U0001F1FF" # flags (iOS)
29
+ "\U00002700-\U000027BF" # dingbats
30
+ "\U000024C2-\U0001F251"
31
+ "]+", flags=re.UNICODE)
32
+
33
+ # units map
34
+ UNITS = {
35
+ "kg": "kilogram","g": "gram","km": "kilometer",
36
+ "m": "meter","cm": "sentimeter","mm": "milimeter",
37
+ "l": "liter"
38
+ }
39
+
40
+ # helper: safe num2words for Indonesian
41
def num_to_words_ind(num_str):
    """Convert a numeric string to Indonesian words.

    - Decimals such as '1.5' or '1,5' are read as '<int> koma <digit> ...'
      with each fractional digit spelled out individually.
    - Grouping separators ('.', ',') in plain integers such as '10.000'
      are stripped before conversion.
    - On conversion failure the original string is returned unchanged.

    Fix vs. original: the two bare ``except:`` clauses also caught
    SystemExit/KeyboardInterrupt; they now catch only the conversion
    errors ``int``/``num2words`` can raise.
    """
    num_str = num_str.strip()
    # Remove thousand separators commonly used in Indonesian (dot); if a
    # single separator splits two digit groups, treat it as a decimal point.
    if re.match(r'^\d+[.,]\d+$', num_str):
        # Decimal number: normalise the separator to '.' and split.
        s = num_str.replace(',', '.')
        left, right = s.split('.', 1)
        try:
            left_w = num2words(int(left), lang='id')
        except (ValueError, OverflowError):
            # Fall back to the raw text rather than crashing the pipeline.
            left_w = left
        # Read each fractional digit separately ("satu koma lima").
        right_w = " ".join(num2words(int(d), lang='id') for d in right if d.isdigit())
        return f"{left_w} koma {right_w}"
    else:
        # Strip '.'/',' used as thousand separators, then convert.
        cleaned = re.sub(r'[.,]', '', num_str)
        try:
            return num2words(int(cleaned), lang='id')
        except (ValueError, OverflowError):
            return num_str
68
+
69
+ # helper: per-digit reader for phone numbers (default)
70
def read_digits_per_digit(number_str, prefix_plus=False):
    """Spell a phone-number-like string digit by digit in Indonesian.

    Non-digit characters (spaces, dashes) are skipped; when *prefix_plus*
    is true the result is prefixed with 'plus' (for '+62...' numbers).
    """
    spoken = [num2words(int(d), lang='id') for d in re.findall(r'\d', number_str)]
    words = " ".join(spoken)
    return "plus " + words if prefix_plus else words
76
+
77
+ # noise removal rule for tokens like 'yyy6yy' or other long mixed garbage:
78
def is_noise_token(tok):
    """Heuristically detect keyboard/ASR garbage tokens.

    A token counts as noise when it is at least 4 characters long and
    either mixes letters with digits (e.g. 'yyy6yy') or consists of one
    character repeated four or more times (e.g. 'aaaa').
    """
    if len(tok) < 4:
        return False
    has_letter = re.search(r'[A-Za-z]', tok)
    has_digit = re.search(r'\d', tok)
    if has_letter and has_digit:
        return True
    return bool(re.fullmatch(r'(.)\1{3,}', tok))
89
+
90
+ # --- 新增:标点规范化函数 ---
91
def punctuation_normalize(text):
    """Normalise punctuation for the TTS front end.

    All punctuation except . , ! ? becomes a comma, runs of commas are
    collapsed, leading commas/ellipses are stripped, and spacing around
    commas and between words is regularised.
    """
    rules = [
        # Brackets, quotes, colon, semicolon, dashes, ellipsis, slashes -> comma.
        (r'[:;()\[\]{}"“”«»…—–/\\]', ','),
        # Collapse comma runs into one.
        (r',+', ','),
        # Drop leading commas and ellipses.
        (r'^(,|\.\.\.|…)+\s*', ''),
        # Exactly one space after each comma.
        (r'\s*,\s*', ', '),
    ]
    for pattern, repl in rules:
        text = re.sub(pattern, repl, text)
    # Merge remaining whitespace and trim the edges.
    return re.sub(r'\s+', ' ', text).strip()
109
+
110
+
111
def normalize_id_tts(text):
    """
    Main normalization pipeline tailored for:
    - Indonesian YouTube subtitles (mostly ASR/MT)
    - TTS frontend requirements:
        * Remove emojis
        * Keep . , ! ? as sentence/phrase delimiters
        * Replace other punctuation with comma
        * Expand numbers, percents, currency, units, times, dates
        * Remove keyboard noise like 'yyy6yy'
        * Keep English words as-is
        * Keep repeated words (do not collapse)

    Note: token branches below are order-sensitive — each ``continue``
    claims the token for the first matching rule (currency before percent,
    phone before time, etc.).
    """
    if not text:
        return text

    # 1) Normalize whitespace and trim
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    # 2) Remove emojis
    text = EMOJI_PATTERN.sub('', text)

    # 3) Punctuation normalization (replaces the old PUNCT_TO_COMMA pass)
    text = punctuation_normalize(text)

    # Protect time and date patterns so commas cannot break them apart.
    text = re.sub(r'(\d{1,2}):(\d{2})', lambda m: f"__TIME_{m.group(1)}_{m.group(2)}__", text)
    text = re.sub(r'(\d{1,4})[\/-](\d{1,2})[\/-](\d{1,4})', lambda m: f"__DATE_{m.group(1)}_{m.group(2)}_{m.group(3)}__", text)

    # Restore the protected time/date markers (slashes normalised to '/').
    text = re.sub(r'__TIME_(\d{1,2})_(\d{2})__', lambda m: f"{m.group(1)}:{m.group(2)}", text)
    text = re.sub(r'__DATE_(\d{1,4})_(\d{1,2})_(\d{1,4})__', lambda m: f"{m.group(1)}/{m.group(2)}/{m.group(3)}", text)

    # 4) Tokenize loosely by spaces and punctuation
    tokens = re.split(r'(\s+|[,.!?])', text)  # keep delimiters

    out_tokens = []
    for tok in tokens:
        # Whitespace/empty capture groups pass through untouched.
        if not tok or tok.isspace():
            out_tokens.append(tok)
            continue

        # keep punctuation .,!? as-is
        if tok in ['.', ',', '!', '?']:
            out_tokens.append(tok)
            continue

        # remove any remaining emojis or control chars
        if EMOJI_PATTERN.search(tok):
            continue

        # slang normalization (case-insensitive lookup)
        lower_tok = tok.lower()
        if lower_tok in SLANG_MAP:
            out_tokens.append(SLANG_MAP[lower_tok])
            continue

        # remove noise tokens (mixed letter/digit garbage, char repeats)
        if is_noise_token(tok):
            continue

        # currency: Rp 10.000 or rp10.000 -> "... rupiah"
        m = re.match(r'^(Rp|rp)\s*([0-9\.,]+)$', tok)
        if m:
            num = m.group(2)
            cleaned = re.sub(r'[.,]', '', num)
            out_tokens.append(f"{num_to_words_ind(cleaned)} rupiah")
            continue

        # percent like 30% -> "... persen"
        m = re.match(r'^(\d+)%$', tok)
        if m:
            out_tokens.append(f"{num_to_words_ind(m.group(1))} persen")
            continue

        # phone numbers +62..., 0812... -> read digit by digit
        m = re.match(r'^\+?\d[\d\-\s]{6,}\d$', tok)
        if m:
            prefix_plus = tok.startswith('+')
            out_tokens.append(read_digits_per_digit(tok, prefix_plus=prefix_plus))
            continue

        # time hh:mm -> "pukul H lewat M menit"
        m = re.match(r'^(\d{1,2}):(\d{2})$', tok)
        if m:
            h, mi = m.group(1), m.group(2)
            # lstrip('0') or '0' keeps "00" readable as zero.
            h_w = num_to_words_ind(h.lstrip('0') or '0')
            mi_w = num_to_words_ind(mi.lstrip('0') or '0')
            out_tokens.append(f"pukul {h_w} lewat {mi_w} menit")
            continue

        # date yyyy/mm/dd or dd/mm/yyyy -> "D <Month> YYYY"
        m = re.match(r'^(\d{1,4})\/(\d{1,2})\/(\d{1,4})$', tok)
        if m:
            a,b,c = m.group(1), m.group(2).zfill(2), m.group(3)
            # A 4-digit first field means ISO order; otherwise day-first.
            if len(a) == 4:
                year, month, day = a, b, c
            elif len(c) == 4:
                day, month, year = a, b, c
            else:
                day, month, year = a, b, c
            MONTHS = {
                "01": "Januari","02": "Februari","03": "Maret","04": "April",
                "05": "Mei","06": "Juni","07": "Juli","08": "Agustus",
                "09": "September","10": "Oktober","11": "November","12": "Desember"
            }
            day_w = num_to_words_ind(day.lstrip('0') or '0')
            year_w = num_to_words_ind(year)
            month_name = MONTHS.get(month, month)
            out_tokens.append(f"{day_w} {month_name} {year_w}")
            continue

        # units like 30kg -> "tiga puluh kilogram"
        m = re.match(r'^(\d+)\s*(kg|g|km|m|cm|mm|l)$', tok, flags=re.I)
        if m:
            num, unit = m.group(1), m.group(2).lower()
            unit_word = UNITS.get(unit, unit)
            out_tokens.append(f"{num_to_words_ind(num)} {unit_word}")
            continue

        # plain integers
        if re.fullmatch(r'\d+', tok):
            out_tokens.append(num_to_words_ind(tok))
            continue

        # numbers with separators ('1.234', '3,14')
        if re.fullmatch(r'[\d\.,]+', tok) and re.search(r'[.,]', tok):
            out_tokens.append(num_to_words_ind(tok))
            continue

        # keep English/as-is tokens
        out_tokens.append(tok)

    normalized = "".join(out_tokens)

    # final cleanup: spacing around punctuation
    normalized = re.sub(r'\s+,', ',', normalized)
    normalized = re.sub(r',\s*', ', ', normalized)
    normalized = re.sub(r'\s+\.', '.', normalized)
    normalized = re.sub(r'\s+!', '!', normalized)
    normalized = re.sub(r'\s+\?', '?', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()

    # Comment out the next line if you do not want the output lower-cased.
    normalized = normalized.lower()

    return normalized
259
+
260
+ # -------------------------
261
+ # Example usage and tests
262
+ # -------------------------
263
if __name__ == "__main__":
    # Smoke test covering noise removal, currency, percent, time, date,
    # phone numbers, decimals and leading punctuation.
    examples = [
        "kita cek Project nadi PHP pemberi harapan palsu tuh yyy6yy 46 ini ini usernya ini di bagian user",
        "Harga Rp 10.000, diskon 30%! Buka jam 09:30 (hari 2025/11/28).",
        "Call +62 812-3456-7890 sekarang!",
        "angka kecil 3.14 dan 1,234 serta 1000",
        "[musik]",
        "... atau mungkin juga jumlah anggota keluarga mereka."
    ]
    for sample in examples:
        print("IN: ", sample)
        print("OUT:", normalize_id_tts(sample))
        print("-"*60)
lemas_tts/infer/text_norm/jieba_dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
lemas_tts/infer/text_norm/pinyin-lexicon-r.txt ADDED
@@ -0,0 +1,4120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ a1 a1
2
+ a2 a2
3
+ a3 a3
4
+ a4 a4
5
+ a5 a5
6
+ ai1 ai1
7
+ ai2 ai2
8
+ ai3 ai3
9
+ ai4 ai4
10
+ ai5 ai5
11
+ an1 an1
12
+ an2 an2
13
+ an3 an3
14
+ an4 an4
15
+ an5 an5
16
+ ang1 ang1
17
+ ang2 ang2
18
+ ang3 ang3
19
+ ang4 ang4
20
+ ang5 ang5
21
+ ao1 ao1
22
+ ao2 ao2
23
+ ao3 ao3
24
+ ao4 ao4
25
+ ao5 ao5
26
+ ba1 b a1
27
+ ba2 b a2
28
+ ba3 b a3
29
+ ba4 b a4
30
+ ba5 b a5
31
+ bai1 b ai1
32
+ bai2 b ai2
33
+ bai3 b ai3
34
+ bai4 b ai4
35
+ bai5 b ai5
36
+ ban1 b an1
37
+ ban2 b an2
38
+ ban3 b an3
39
+ ban4 b an4
40
+ ban5 b an5
41
+ bang1 b ang1
42
+ bang2 b ang2
43
+ bang3 b ang3
44
+ bang4 b ang4
45
+ bang5 b ang5
46
+ bao1 b ao1
47
+ bao2 b ao2
48
+ bao3 b ao3
49
+ bao4 b ao4
50
+ bao5 b ao5
51
+ bei1 b ei1
52
+ bei2 b ei2
53
+ bei3 b ei3
54
+ bei4 b ei4
55
+ bei5 b ei5
56
+ ben1 b en1
57
+ ben2 b en2
58
+ ben3 b en3
59
+ ben4 b en4
60
+ ben5 b en5
61
+ beng1 b eng1
62
+ beng2 b eng2
63
+ beng3 b eng3
64
+ beng4 b eng4
65
+ beng5 b eng5
66
+ bi1 b i1
67
+ bi2 b i2
68
+ bi3 b i3
69
+ bi4 b i4
70
+ bi5 b i5
71
+ bian1 b ian1
72
+ bian2 b ian2
73
+ bian3 b ian3
74
+ bian4 b ian4
75
+ bian5 b ian5
76
+ biao1 b iao1
77
+ biao2 b iao2
78
+ biao3 b iao3
79
+ biao4 b iao4
80
+ biao5 b iao5
81
+ bie1 b ie1
82
+ bie2 b ie2
83
+ bie3 b ie3
84
+ bie4 b ie4
85
+ bie5 b ie5
86
+ bin1 b in1
87
+ bin2 b in2
88
+ bin3 b in3
89
+ bin4 b in4
90
+ bin5 b in5
91
+ bing1 b ing1
92
+ bing2 b ing2
93
+ bing3 b ing3
94
+ bing4 b ing4
95
+ bing5 b ing5
96
+ bo1 b o1
97
+ bo2 b o2
98
+ bo3 b o3
99
+ bo4 b o4
100
+ bo5 b o5
101
+ bu1 b u1
102
+ bu2 b u2
103
+ bu3 b u3
104
+ bu4 b u4
105
+ bu5 b u5
106
+ ca1 c a1
107
+ ca2 c a2
108
+ ca3 c a3
109
+ ca4 c a4
110
+ ca5 c a5
111
+ cai1 c ai1
112
+ cai2 c ai2
113
+ cai3 c ai3
114
+ cai4 c ai4
115
+ cai5 c ai5
116
+ can1 c an1
117
+ can2 c an2
118
+ can3 c an3
119
+ can4 c an4
120
+ can5 c an5
121
+ cang1 c ang1
122
+ cang2 c ang2
123
+ cang3 c ang3
124
+ cang4 c ang4
125
+ cang5 c ang5
126
+ cao1 c ao1
127
+ cao2 c ao2
128
+ cao3 c ao3
129
+ cao4 c ao4
130
+ cao5 c ao5
131
+ ce1 c e1
132
+ ce2 c e2
133
+ ce3 c e3
134
+ ce4 c e4
135
+ ce5 c e5
136
+ cen1 c en1
137
+ cen2 c en2
138
+ cen3 c en3
139
+ cen4 c en4
140
+ cen5 c en5
141
+ ceng1 c eng1
142
+ ceng2 c eng2
143
+ ceng3 c eng3
144
+ ceng4 c eng4
145
+ ceng5 c eng5
146
+ cha1 ch a1
147
+ cha2 ch a2
148
+ cha3 ch a3
149
+ cha4 ch a4
150
+ cha5 ch a5
151
+ chai1 ch ai1
152
+ chai2 ch ai2
153
+ chai3 ch ai3
154
+ chai4 ch ai4
155
+ chai5 ch ai5
156
+ chan1 ch an1
157
+ chan2 ch an2
158
+ chan3 ch an3
159
+ chan4 ch an4
160
+ chan5 ch an5
161
+ chang1 ch ang1
162
+ chang2 ch ang2
163
+ chang3 ch ang3
164
+ chang4 ch ang4
165
+ chang5 ch ang5
166
+ chao1 ch ao1
167
+ chao2 ch ao2
168
+ chao3 ch ao3
169
+ chao4 ch ao4
170
+ chao5 ch ao5
171
+ che1 ch e1
172
+ che2 ch e2
173
+ che3 ch e3
174
+ che4 ch e4
175
+ che5 ch e5
176
+ chen1 ch en1
177
+ chen2 ch en2
178
+ chen3 ch en3
179
+ chen4 ch en4
180
+ chen5 ch en5
181
+ cheng1 ch eng1
182
+ cheng2 ch eng2
183
+ cheng3 ch eng3
184
+ cheng4 ch eng4
185
+ cheng5 ch eng5
186
+ chi1 ch iii1
187
+ chi2 ch iii2
188
+ chi3 ch iii3
189
+ chi4 ch iii4
190
+ chi5 ch iii5
191
+ chong1 ch ong1
192
+ chong2 ch ong2
193
+ chong3 ch ong3
194
+ chong4 ch ong4
195
+ chong5 ch ong5
196
+ chou1 ch ou1
197
+ chou2 ch ou2
198
+ chou3 ch ou3
199
+ chou4 ch ou4
200
+ chou5 ch ou5
201
+ chu1 ch u1
202
+ chu2 ch u2
203
+ chu3 ch u3
204
+ chu4 ch u4
205
+ chu5 ch u5
206
+ chuai1 ch uai1
207
+ chuai2 ch uai2
208
+ chuai3 ch uai3
209
+ chuai4 ch uai4
210
+ chuai5 ch uai5
211
+ chuan1 ch uan1
212
+ chuan2 ch uan2
213
+ chuan3 ch uan3
214
+ chuan4 ch uan4
215
+ chuan5 ch uan5
216
+ chuang1 ch uang1
217
+ chuang2 ch uang2
218
+ chuang3 ch uang3
219
+ chuang4 ch uang4
220
+ chuang5 ch uang5
221
+ chui1 ch uei1
222
+ chui2 ch uei2
223
+ chui3 ch uei3
224
+ chui4 ch uei4
225
+ chui5 ch uei5
226
+ chun1 ch uen1
227
+ chun2 ch uen2
228
+ chun3 ch uen3
229
+ chun4 ch uen4
230
+ chun5 ch uen5
231
+ chuo1 ch uo1
232
+ chuo2 ch uo2
233
+ chuo3 ch uo3
234
+ chuo4 ch uo4
235
+ chuo5 ch uo5
236
+ ci1 c ii1
237
+ ci2 c ii2
238
+ ci3 c ii3
239
+ ci4 c ii4
240
+ ci5 c ii5
241
+ cong1 c ong1
242
+ cong2 c ong2
243
+ cong3 c ong3
244
+ cong4 c ong4
245
+ cong5 c ong5
246
+ cou1 c ou1
247
+ cou2 c ou2
248
+ cou3 c ou3
249
+ cou4 c ou4
250
+ cou5 c ou5
251
+ cu1 c u1
252
+ cu2 c u2
253
+ cu3 c u3
254
+ cu4 c u4
255
+ cu5 c u5
256
+ cuan1 c uan1
257
+ cuan2 c uan2
258
+ cuan3 c uan3
259
+ cuan4 c uan4
260
+ cuan5 c uan5
261
+ cui1 c uei1
262
+ cui2 c uei2
263
+ cui3 c uei3
264
+ cui4 c uei4
265
+ cui5 c uei5
266
+ cun1 c uen1
267
+ cun2 c uen2
268
+ cun3 c uen3
269
+ cun4 c uen4
270
+ cun5 c uen5
271
+ cuo1 c uo1
272
+ cuo2 c uo2
273
+ cuo3 c uo3
274
+ cuo4 c uo4
275
+ cuo5 c uo5
276
+ da1 d a1
277
+ da2 d a2
278
+ da3 d a3
279
+ da4 d a4
280
+ da5 d a5
281
+ dai1 d ai1
282
+ dai2 d ai2
283
+ dai3 d ai3
284
+ dai4 d ai4
285
+ dai5 d ai5
286
+ dan1 d an1
287
+ dan2 d an2
288
+ dan3 d an3
289
+ dan4 d an4
290
+ dan5 d an5
291
+ dang1 d ang1
292
+ dang2 d ang2
293
+ dang3 d ang3
294
+ dang4 d ang4
295
+ dang5 d ang5
296
+ dao1 d ao1
297
+ dao2 d ao2
298
+ dao3 d ao3
299
+ dao4 d ao4
300
+ dao5 d ao5
301
+ de1 d e1
302
+ de2 d e2
303
+ de3 d e3
304
+ de4 d e4
305
+ de5 d e5
306
+ dei1 d ei1
307
+ dei2 d ei2
308
+ dei3 d ei3
309
+ dei4 d ei4
310
+ dei5 d ei5
311
+ den1 d en1
312
+ den2 d en2
313
+ den3 d en3
314
+ den4 d en4
315
+ den5 d en5
316
+ deng1 d eng1
317
+ deng2 d eng2
318
+ deng3 d eng3
319
+ deng4 d eng4
320
+ deng5 d eng5
321
+ di1 d i1
322
+ di2 d i2
323
+ di3 d i3
324
+ di4 d i4
325
+ di5 d i5
326
+ dia1 d ia1
327
+ dia2 d ia2
328
+ dia3 d ia3
329
+ dia4 d ia4
330
+ dia5 d ia5
331
+ dian1 d ian1
332
+ dian2 d ian2
333
+ dian3 d ian3
334
+ dian4 d ian4
335
+ dian5 d ian5
336
+ diao1 d iao1
337
+ diao2 d iao2
338
+ diao3 d iao3
339
+ diao4 d iao4
340
+ diao5 d iao5
341
+ die1 d ie1
342
+ die2 d ie2
343
+ die3 d ie3
344
+ die4 d ie4
345
+ die5 d ie5
346
+ ding1 d ing1
347
+ ding2 d ing2
348
+ ding3 d ing3
349
+ ding4 d ing4
350
+ ding5 d ing5
351
+ diu1 d iou1
352
+ diu2 d iou2
353
+ diu3 d iou3
354
+ diu4 d iou4
355
+ diu5 d iou5
356
+ dong1 d ong1
357
+ dong2 d ong2
358
+ dong3 d ong3
359
+ dong4 d ong4
360
+ dong5 d ong5
361
+ dou1 d ou1
362
+ dou2 d ou2
363
+ dou3 d ou3
364
+ dou4 d ou4
365
+ dou5 d ou5
366
+ du1 d u1
367
+ du2 d u2
368
+ du3 d u3
369
+ du4 d u4
370
+ du5 d u5
371
+ duan1 d uan1
372
+ duan2 d uan2
373
+ duan3 d uan3
374
+ duan4 d uan4
375
+ duan5 d uan5
376
+ dui1 d uei1
377
+ dui2 d uei2
378
+ dui3 d uei3
379
+ dui4 d uei4
380
+ dui5 d uei5
381
+ dun1 d uen1
382
+ dun2 d uen2
383
+ dun3 d uen3
384
+ dun4 d uen4
385
+ dun5 d uen5
386
+ duo1 d uo1
387
+ duo2 d uo2
388
+ duo3 d uo3
389
+ duo4 d uo4
390
+ duo5 d uo5
391
+ e1 e1
392
+ e2 e2
393
+ e3 e3
394
+ e4 e4
395
+ e5 e5
396
+ ei1 ei1
397
+ ei2 ei2
398
+ ei3 ei3
399
+ ei4 ei4
400
+ ei5 ei5
401
+ en1 en1
402
+ en2 en2
403
+ en3 en3
404
+ en4 en4
405
+ en5 en5
406
+ eng1 eng1
407
+ eng2 eng2
408
+ eng3 eng3
409
+ eng4 eng4
410
+ eng5 eng5
411
+ r1 er1
412
+ r2 er2
413
+ r3 er3
414
+ r4 er4
415
+ r5 er5
416
+ er1 er1
417
+ er2 er2
418
+ er3 er3
419
+ er4 er4
420
+ er5 er5
421
+ fa1 f a1
422
+ fa2 f a2
423
+ fa3 f a3
424
+ fa4 f a4
425
+ fa5 f a5
426
+ fan1 f an1
427
+ fan2 f an2
428
+ fan3 f an3
429
+ fan4 f an4
430
+ fan5 f an5
431
+ fang1 f ang1
432
+ fang2 f ang2
433
+ fang3 f ang3
434
+ fang4 f ang4
435
+ fang5 f ang5
436
+ fei1 f ei1
437
+ fei2 f ei2
438
+ fei3 f ei3
439
+ fei4 f ei4
440
+ fei5 f ei5
441
+ fen1 f en1
442
+ fen2 f en2
443
+ fen3 f en3
444
+ fen4 f en4
445
+ fen5 f en5
446
+ feng1 f eng1
447
+ feng2 f eng2
448
+ feng3 f eng3
449
+ feng4 f eng4
450
+ feng5 f eng5
451
+ fo1 f o1
452
+ fo2 f o2
453
+ fo3 f o3
454
+ fo4 f o4
455
+ fo5 f o5
456
+ fou1 f ou1
457
+ fou2 f ou2
458
+ fou3 f ou3
459
+ fou4 f ou4
460
+ fou5 f ou5
461
+ fu1 f u1
462
+ fu2 f u2
463
+ fu3 f u3
464
+ fu4 f u4
465
+ fu5 f u5
466
+ ga1 g a1
467
+ ga2 g a2
468
+ ga3 g a3
469
+ ga4 g a4
470
+ ga5 g a5
471
+ gai1 g ai1
472
+ gai2 g ai2
473
+ gai3 g ai3
474
+ gai4 g ai4
475
+ gai5 g ai5
476
+ gan1 g an1
477
+ gan2 g an2
478
+ gan3 g an3
479
+ gan4 g an4
480
+ gan5 g an5
481
+ gang1 g ang1
482
+ gang2 g ang2
483
+ gang3 g ang3
484
+ gang4 g ang4
485
+ gang5 g ang5
486
+ gao1 g ao1
487
+ gao2 g ao2
488
+ gao3 g ao3
489
+ gao4 g ao4
490
+ gao5 g ao5
491
+ ge1 g e1
492
+ ge2 g e2
493
+ ge3 g e3
494
+ ge4 g e4
495
+ ge5 g e5
496
+ gei1 g ei1
497
+ gei2 g ei2
498
+ gei3 g ei3
499
+ gei4 g ei4
500
+ gei5 g ei5
501
+ gen1 g en1
502
+ gen2 g en2
503
+ gen3 g en3
504
+ gen4 g en4
505
+ gen5 g en5
506
+ geng1 g eng1
507
+ geng2 g eng2
508
+ geng3 g eng3
509
+ geng4 g eng4
510
+ geng5 g eng5
511
+ gong1 g ong1
512
+ gong2 g ong2
513
+ gong3 g ong3
514
+ gong4 g ong4
515
+ gong5 g ong5
516
+ gou1 g ou1
517
+ gou2 g ou2
518
+ gou3 g ou3
519
+ gou4 g ou4
520
+ gou5 g ou5
521
+ gu1 g u1
522
+ gu2 g u2
523
+ gu3 g u3
524
+ gu4 g u4
525
+ gu5 g u5
526
+ gua1 g ua1
527
+ gua2 g ua2
528
+ gua3 g ua3
529
+ gua4 g ua4
530
+ gua5 g ua5
531
+ guai1 g uai1
532
+ guai2 g uai2
533
+ guai3 g uai3
534
+ guai4 g uai4
535
+ guai5 g uai5
536
+ guan1 g uan1
537
+ guan2 g uan2
538
+ guan3 g uan3
539
+ guan4 g uan4
540
+ guan5 g uan5
541
+ guang1 g uang1
542
+ guang2 g uang2
543
+ guang3 g uang3
544
+ guang4 g uang4
545
+ guang5 g uang5
546
+ gui1 g uei1
547
+ gui2 g uei2
548
+ gui3 g uei3
549
+ gui4 g uei4
550
+ gui5 g uei5
551
+ gun1 g uen1
552
+ gun2 g uen2
553
+ gun3 g uen3
554
+ gun4 g uen4
555
+ gun5 g uen5
556
+ guo1 g uo1
557
+ guo2 g uo2
558
+ guo3 g uo3
559
+ guo4 g uo4
560
+ guo5 g uo5
561
+ ha1 h a1
562
+ ha2 h a2
563
+ ha3 h a3
564
+ ha4 h a4
565
+ ha5 h a5
566
+ hai1 h ai1
567
+ hai2 h ai2
568
+ hai3 h ai3
569
+ hai4 h ai4
570
+ hai5 h ai5
571
+ han1 h an1
572
+ han2 h an2
573
+ han3 h an3
574
+ han4 h an4
575
+ han5 h an5
576
+ hang1 h ang1
577
+ hang2 h ang2
578
+ hang3 h ang3
579
+ hang4 h ang4
580
+ hang5 h ang5
581
+ hao1 h ao1
582
+ hao2 h ao2
583
+ hao3 h ao3
584
+ hao4 h ao4
585
+ hao5 h ao5
586
+ he1 h e1
587
+ he2 h e2
588
+ he3 h e3
589
+ he4 h e4
590
+ he5 h e5
591
+ hei1 h ei1
592
+ hei2 h ei2
593
+ hei3 h ei3
594
+ hei4 h ei4
595
+ hei5 h ei5
596
+ hen1 h en1
597
+ hen2 h en2
598
+ hen3 h en3
599
+ hen4 h en4
600
+ hen5 h en5
601
+ heng1 h eng1
602
+ heng2 h eng2
603
+ heng3 h eng3
604
+ heng4 h eng4
605
+ heng5 h eng5
606
+ hong1 h ong1
607
+ hong2 h ong2
608
+ hong3 h ong3
609
+ hong4 h ong4
610
+ hong5 h ong5
611
+ hou1 h ou1
612
+ hou2 h ou2
613
+ hou3 h ou3
614
+ hou4 h ou4
615
+ hou5 h ou5
616
+ hu1 h u1
617
+ hu2 h u2
618
+ hu3 h u3
619
+ hu4 h u4
620
+ hu5 h u5
621
+ hua1 h ua1
622
+ hua2 h ua2
623
+ hua3 h ua3
624
+ hua4 h ua4
625
+ hua5 h ua5
626
+ huai1 h uai1
627
+ huai2 h uai2
628
+ huai3 h uai3
629
+ huai4 h uai4
630
+ huai5 h uai5
631
+ huan1 h uan1
632
+ huan2 h uan2
633
+ huan3 h uan3
634
+ huan4 h uan4
635
+ huan5 h uan5
636
+ huang1 h uang1
637
+ huang2 h uang2
638
+ huang3 h uang3
639
+ huang4 h uang4
640
+ huang5 h uang5
641
+ hui1 h uei1
642
+ hui2 h uei2
643
+ hui3 h uei3
644
+ hui4 h uei4
645
+ hui5 h uei5
646
+ hun1 h uen1
647
+ hun2 h uen2
648
+ hun3 h uen3
649
+ hun4 h uen4
650
+ hun5 h uen5
651
+ huo1 h uo1
652
+ huo2 h uo2
653
+ huo3 h uo3
654
+ huo4 h uo4
655
+ huo5 h uo5
656
+ ji1 j i1
657
+ ji2 j i2
658
+ ji3 j i3
659
+ ji4 j i4
660
+ ji5 j i5
661
+ jia1 j ia1
662
+ jia2 j ia2
663
+ jia3 j ia3
664
+ jia4 j ia4
665
+ jia5 j ia5
666
+ jian1 j ian1
667
+ jian2 j ian2
668
+ jian3 j ian3
669
+ jian4 j ian4
670
+ jian5 j ian5
671
+ jiang1 j iang1
672
+ jiang2 j iang2
673
+ jiang3 j iang3
674
+ jiang4 j iang4
675
+ jiang5 j iang5
676
+ jiao1 j iao1
677
+ jiao2 j iao2
678
+ jiao3 j iao3
679
+ jiao4 j iao4
680
+ jiao5 j iao5
681
+ jie1 j ie1
682
+ jie2 j ie2
683
+ jie3 j ie3
684
+ jie4 j ie4
685
+ jie5 j ie5
686
+ jin1 j in1
687
+ jin2 j in2
688
+ jin3 j in3
689
+ jin4 j in4
690
+ jin5 j in5
691
+ jing1 j ing1
692
+ jing2 j ing2
693
+ jing3 j ing3
694
+ jing4 j ing4
695
+ jing5 j ing5
696
+ jiong1 j iong1
697
+ jiong2 j iong2
698
+ jiong3 j iong3
699
+ jiong4 j iong4
700
+ jiong5 j iong5
701
+ jiu1 j iou1
702
+ jiu2 j iou2
703
+ jiu3 j iou3
704
+ jiu4 j iou4
705
+ jiu5 j iou5
706
+ ju1 j v1
707
+ ju2 j v2
708
+ ju3 j v3
709
+ ju4 j v4
710
+ ju5 j v5
711
+ juan1 j van1
712
+ juan2 j van2
713
+ juan3 j van3
714
+ juan4 j van4
715
+ juan5 j van5
716
+ jue1 j ve1
717
+ jue2 j ve2
718
+ jue3 j ve3
719
+ jue4 j ve4
720
+ jue5 j ve5
721
+ jun1 j vn1
722
+ jun2 j vn2
723
+ jun3 j vn3
724
+ jun4 j vn4
725
+ jun5 j vn5
726
+ ka1 k a1
727
+ ka2 k a2
728
+ ka3 k a3
729
+ ka4 k a4
730
+ ka5 k a5
731
+ kai1 k ai1
732
+ kai2 k ai2
733
+ kai3 k ai3
734
+ kai4 k ai4
735
+ kai5 k ai5
736
+ kan1 k an1
737
+ kan2 k an2
738
+ kan3 k an3
739
+ kan4 k an4
740
+ kan5 k an5
741
+ kang1 k ang1
742
+ kang2 k ang2
743
+ kang3 k ang3
744
+ kang4 k ang4
745
+ kang5 k ang5
746
+ kao1 k ao1
747
+ kao2 k ao2
748
+ kao3 k ao3
749
+ kao4 k ao4
750
+ kao5 k ao5
751
+ ke1 k e1
752
+ ke2 k e2
753
+ ke3 k e3
754
+ ke4 k e4
755
+ ke5 k e5
756
+ kei1 k ei1
757
+ kei2 k ei2
758
+ kei3 k ei3
759
+ kei4 k ei4
760
+ kei5 k ei5
761
+ ken1 k en1
762
+ ken2 k en2
763
+ ken3 k en3
764
+ ken4 k en4
765
+ ken5 k en5
766
+ keng1 k eng1
767
+ keng2 k eng2
768
+ keng3 k eng3
769
+ keng4 k eng4
770
+ keng5 k eng5
771
+ kong1 k ong1
772
+ kong2 k ong2
773
+ kong3 k ong3
774
+ kong4 k ong4
775
+ kong5 k ong5
776
+ kou1 k ou1
777
+ kou2 k ou2
778
+ kou3 k ou3
779
+ kou4 k ou4
780
+ kou5 k ou5
781
+ ku1 k u1
782
+ ku2 k u2
783
+ ku3 k u3
784
+ ku4 k u4
785
+ ku5 k u5
786
+ kua1 k ua1
787
+ kua2 k ua2
788
+ kua3 k ua3
789
+ kua4 k ua4
790
+ kua5 k ua5
791
+ kuai1 k uai1
792
+ kuai2 k uai2
793
+ kuai3 k uai3
794
+ kuai4 k uai4
795
+ kuai5 k uai5
796
+ kuan1 k uan1
797
+ kuan2 k uan2
798
+ kuan3 k uan3
799
+ kuan4 k uan4
800
+ kuan5 k uan5
801
+ kuang1 k uang1
802
+ kuang2 k uang2
803
+ kuang3 k uang3
804
+ kuang4 k uang4
805
+ kuang5 k uang5
806
+ kui1 k uei1
807
+ kui2 k uei2
808
+ kui3 k uei3
809
+ kui4 k uei4
810
+ kui5 k uei5
811
+ kun1 k uen1
812
+ kun2 k uen2
813
+ kun3 k uen3
814
+ kun4 k uen4
815
+ kun5 k uen5
816
+ kuo1 k uo1
817
+ kuo2 k uo2
818
+ kuo3 k uo3
819
+ kuo4 k uo4
820
+ kuo5 k uo5
821
+ la1 l a1
822
+ la2 l a2
823
+ la3 l a3
824
+ la4 l a4
825
+ la5 l a5
826
+ lai1 l ai1
827
+ lai2 l ai2
828
+ lai3 l ai3
829
+ lai4 l ai4
830
+ lai5 l ai5
831
+ lan1 l an1
832
+ lan2 l an2
833
+ lan3 l an3
834
+ lan4 l an4
835
+ lan5 l an5
836
+ lang1 l ang1
837
+ lang2 l ang2
838
+ lang3 l ang3
839
+ lang4 l ang4
840
+ lang5 l ang5
841
+ lao1 l ao1
842
+ lao2 l ao2
843
+ lao3 l ao3
844
+ lao4 l ao4
845
+ lao5 l ao5
846
+ le1 l e1
847
+ le2 l e2
848
+ le3 l e3
849
+ le4 l e4
850
+ le5 l e5
851
+ lei1 l ei1
852
+ lei2 l ei2
853
+ lei3 l ei3
854
+ lei4 l ei4
855
+ lei5 l ei5
856
+ leng1 l eng1
857
+ leng2 l eng2
858
+ leng3 l eng3
859
+ leng4 l eng4
860
+ leng5 l eng5
861
+ li1 l i1
862
+ li2 l i2
863
+ li3 l i3
864
+ li4 l i4
865
+ li5 l i5
866
+ lia1 l ia1
867
+ lia2 l ia2
868
+ lia3 l ia3
869
+ lia4 l ia4
870
+ lia5 l ia5
871
+ lian1 l ian1
872
+ lian2 l ian2
873
+ lian3 l ian3
874
+ lian4 l ian4
875
+ lian5 l ian5
876
+ liang1 l iang1
877
+ liang2 l iang2
878
+ liang3 l iang3
879
+ liang4 l iang4
880
+ liang5 l iang5
881
+ liao1 l iao1
882
+ liao2 l iao2
883
+ liao3 l iao3
884
+ liao4 l iao4
885
+ liao5 l iao5
886
+ lie1 l ie1
887
+ lie2 l ie2
888
+ lie3 l ie3
889
+ lie4 l ie4
890
+ lie5 l ie5
891
+ lin1 l in1
892
+ lin2 l in2
893
+ lin3 l in3
894
+ lin4 l in4
895
+ lin5 l in5
896
+ ling1 l ing1
897
+ ling2 l ing2
898
+ ling3 l ing3
899
+ ling4 l ing4
900
+ ling5 l ing5
901
+ liu1 l iou1
902
+ liu2 l iou2
903
+ liu3 l iou3
904
+ liu4 l iou4
905
+ liu5 l iou5
906
+ lo1 l o1
907
+ lo2 l o2
908
+ lo3 l o3
909
+ lo4 l o4
910
+ lo5 l o5
911
+ long1 l ong1
912
+ long2 l ong2
913
+ long3 l ong3
914
+ long4 l ong4
915
+ long5 l ong5
916
+ lou1 l ou1
917
+ lou2 l ou2
918
+ lou3 l ou3
919
+ lou4 l ou4
920
+ lou5 l ou5
921
+ lu1 l u1
922
+ lu2 l u2
923
+ lu3 l u3
924
+ lu4 l u4
925
+ lu5 l u5
926
+ luan1 l uan1
927
+ luan2 l uan2
928
+ luan3 l uan3
929
+ luan4 l uan4
930
+ luan5 l uan5
931
+ lue1 l ve1
932
+ lue2 l ve2
933
+ lue3 l ve3
934
+ lue4 l ve4
935
+ lue5 l ve5
936
+ lve1 l ve1
937
+ lve2 l ve2
938
+ lve3 l ve3
939
+ lve4 l ve4
940
+ lve5 l ve5
941
+ lun1 l uen1
942
+ lun2 l uen2
943
+ lun3 l uen3
944
+ lun4 l uen4
945
+ lun5 l uen5
946
+ luo1 l uo1
947
+ luo2 l uo2
948
+ luo3 l uo3
949
+ luo4 l uo4
950
+ luo5 l uo5
951
+ lv1 l v1
952
+ lv2 l v2
953
+ lv3 l v3
954
+ lv4 l v4
955
+ lv5 l v5
956
+ ma1 m a1
957
+ ma2 m a2
958
+ ma3 m a3
959
+ ma4 m a4
960
+ ma5 m a5
961
+ mai1 m ai1
962
+ mai2 m ai2
963
+ mai3 m ai3
964
+ mai4 m ai4
965
+ mai5 m ai5
966
+ man1 m an1
967
+ man2 m an2
968
+ man3 m an3
969
+ man4 m an4
970
+ man5 m an5
971
+ mang1 m ang1
972
+ mang2 m ang2
973
+ mang3 m ang3
974
+ mang4 m ang4
975
+ mang5 m ang5
976
+ mao1 m ao1
977
+ mao2 m ao2
978
+ mao3 m ao3
979
+ mao4 m ao4
980
+ mao5 m ao5
981
+ me1 m e1
982
+ me2 m e2
983
+ me3 m e3
984
+ me4 m e4
985
+ me5 m e5
986
+ mei1 m ei1
987
+ mei2 m ei2
988
+ mei3 m ei3
989
+ mei4 m ei4
990
+ mei5 m ei5
991
+ men1 m en1
992
+ men2 m en2
993
+ men3 m en3
994
+ men4 m en4
995
+ men5 m en5
996
+ meng1 m eng1
997
+ meng2 m eng2
998
+ meng3 m eng3
999
+ meng4 m eng4
1000
+ meng5 m eng5
1001
+ mi1 m i1
1002
+ mi2 m i2
1003
+ mi3 m i3
1004
+ mi4 m i4
1005
+ mi5 m i5
1006
+ mian1 m ian1
1007
+ mian2 m ian2
1008
+ mian3 m ian3
1009
+ mian4 m ian4
1010
+ mian5 m ian5
1011
+ miao1 m iao1
1012
+ miao2 m iao2
1013
+ miao3 m iao3
1014
+ miao4 m iao4
1015
+ miao5 m iao5
1016
+ mie1 m ie1
1017
+ mie2 m ie2
1018
+ mie3 m ie3
1019
+ mie4 m ie4
1020
+ mie5 m ie5
1021
+ min1 m in1
1022
+ min2 m in2
1023
+ min3 m in3
1024
+ min4 m in4
1025
+ min5 m in5
1026
+ ming1 m ing1
1027
+ ming2 m ing2
1028
+ ming3 m ing3
1029
+ ming4 m ing4
1030
+ ming5 m ing5
1031
+ miu1 m iou1
1032
+ miu2 m iou2
1033
+ miu3 m iou3
1034
+ miu4 m iou4
1035
+ miu5 m iou5
1036
+ mo1 m o1
1037
+ mo2 m o2
1038
+ mo3 m o3
1039
+ mo4 m o4
1040
+ mo5 m o5
1041
+ mou1 m ou1
1042
+ mou2 m ou2
1043
+ mou3 m ou3
1044
+ mou4 m ou4
1045
+ mou5 m ou5
1046
+ mu1 m u1
1047
+ mu2 m u2
1048
+ mu3 m u3
1049
+ mu4 m u4
1050
+ mu5 m u5
1051
+ na1 n a1
1052
+ na2 n a2
1053
+ na3 n a3
1054
+ na4 n a4
1055
+ na5 n a5
1056
+ nai1 n ai1
1057
+ nai2 n ai2
1058
+ nai3 n ai3
1059
+ nai4 n ai4
1060
+ nai5 n ai5
1061
+ nan1 n an1
1062
+ nan2 n an2
1063
+ nan3 n an3
1064
+ nan4 n an4
1065
+ nan5 n an5
1066
+ nang1 n ang1
1067
+ nang2 n ang2
1068
+ nang3 n ang3
1069
+ nang4 n ang4
1070
+ nang5 n ang5
1071
+ nao1 n ao1
1072
+ nao2 n ao2
1073
+ nao3 n ao3
1074
+ nao4 n ao4
1075
+ nao5 n ao5
1076
+ ne1 n e1
1077
+ ne2 n e2
1078
+ ne3 n e3
1079
+ ne4 n e4
1080
+ ne5 n e5
1081
+ nei1 n ei1
1082
+ nei2 n ei2
1083
+ nei3 n ei3
1084
+ nei4 n ei4
1085
+ nei5 n ei5
1086
+ nen1 n en1
1087
+ nen2 n en2
1088
+ nen3 n en3
1089
+ nen4 n en4
1090
+ nen5 n en5
1091
+ neng1 n eng1
1092
+ neng2 n eng2
1093
+ neng3 n eng3
1094
+ neng4 n eng4
1095
+ neng5 n eng5
1096
+ ni1 n i1
1097
+ ni2 n i2
1098
+ ni3 n i3
1099
+ ni4 n i4
1100
+ ni5 n i5
1101
+ nian1 n ian1
1102
+ nian2 n ian2
1103
+ nian3 n ian3
1104
+ nian4 n ian4
1105
+ nian5 n ian5
1106
+ niang1 n iang1
1107
+ niang2 n iang2
1108
+ niang3 n iang3
1109
+ niang4 n iang4
1110
+ niang5 n iang5
1111
+ niao1 n iao1
1112
+ niao2 n iao2
1113
+ niao3 n iao3
1114
+ niao4 n iao4
1115
+ niao5 n iao5
1116
+ nie1 n ie1
1117
+ nie2 n ie2
1118
+ nie3 n ie3
1119
+ nie4 n ie4
1120
+ nie5 n ie5
1121
+ nin1 n in1
1122
+ nin2 n in2
1123
+ nin3 n in3
1124
+ nin4 n in4
1125
+ nin5 n in5
1126
+ ning1 n ing1
1127
+ ning2 n ing2
1128
+ ning3 n ing3
1129
+ ning4 n ing4
1130
+ ning5 n ing5
1131
+ niu1 n iou1
1132
+ niu2 n iou2
1133
+ niu3 n iou3
1134
+ niu4 n iou4
1135
+ niu5 n iou5
1136
+ nong1 n ong1
1137
+ nong2 n ong2
1138
+ nong3 n ong3
1139
+ nong4 n ong4
1140
+ nong5 n ong5
1141
+ nou1 n ou1
1142
+ nou2 n ou2
1143
+ nou3 n ou3
1144
+ nou4 n ou4
1145
+ nou5 n ou5
1146
+ nu1 n u1
1147
+ nu2 n u2
1148
+ nu3 n u3
1149
+ nu4 n u4
1150
+ nu5 n u5
1151
+ nuan1 n uan1
1152
+ nuan2 n uan2
1153
+ nuan3 n uan3
1154
+ nuan4 n uan4
1155
+ nuan5 n uan5
1156
+ nue1 n ve1
1157
+ nue2 n ve2
1158
+ nue3 n ve3
1159
+ nue4 n ve4
1160
+ nue5 n ve5
1161
+ nve1 n ve1
1162
+ nve2 n ve2
1163
+ nve3 n ve3
1164
+ nve4 n ve4
1165
+ nve5 n ve5
1166
+ nuo1 n uo1
1167
+ nuo2 n uo2
1168
+ nuo3 n uo3
1169
+ nuo4 n uo4
1170
+ nuo5 n uo5
1171
+ nv1 n v1
1172
+ nv2 n v2
1173
+ nv3 n v3
1174
+ nv4 n v4
1175
+ nv5 n v5
1176
+ o1 o1
1177
+ o2 o2
1178
+ o3 o3
1179
+ o4 o4
1180
+ o5 o5
1181
+ ou1 ou1
1182
+ ou2 ou2
1183
+ ou3 ou3
1184
+ ou4 ou4
1185
+ ou5 ou5
1186
+ pa1 p a1
1187
+ pa2 p a2
1188
+ pa3 p a3
1189
+ pa4 p a4
1190
+ pa5 p a5
1191
+ pai1 p ai1
1192
+ pai2 p ai2
1193
+ pai3 p ai3
1194
+ pai4 p ai4
1195
+ pai5 p ai5
1196
+ pan1 p an1
1197
+ pan2 p an2
1198
+ pan3 p an3
1199
+ pan4 p an4
1200
+ pan5 p an5
1201
+ pang1 p ang1
1202
+ pang2 p ang2
1203
+ pang3 p ang3
1204
+ pang4 p ang4
1205
+ pang5 p ang5
1206
+ pao1 p ao1
1207
+ pao2 p ao2
1208
+ pao3 p ao3
1209
+ pao4 p ao4
1210
+ pao5 p ao5
1211
+ pei1 p ei1
1212
+ pei2 p ei2
1213
+ pei3 p ei3
1214
+ pei4 p ei4
1215
+ pei5 p ei5
1216
+ pen1 p en1
1217
+ pen2 p en2
1218
+ pen3 p en3
1219
+ pen4 p en4
1220
+ pen5 p en5
1221
+ peng1 p eng1
1222
+ peng2 p eng2
1223
+ peng3 p eng3
1224
+ peng4 p eng4
1225
+ peng5 p eng5
1226
+ pi1 p i1
1227
+ pi2 p i2
1228
+ pi3 p i3
1229
+ pi4 p i4
1230
+ pi5 p i5
1231
+ pian1 p ian1
1232
+ pian2 p ian2
1233
+ pian3 p ian3
1234
+ pian4 p ian4
1235
+ pian5 p ian5
1236
+ piao1 p iao1
1237
+ piao2 p iao2
1238
+ piao3 p iao3
1239
+ piao4 p iao4
1240
+ piao5 p iao5
1241
+ pie1 p ie1
1242
+ pie2 p ie2
1243
+ pie3 p ie3
1244
+ pie4 p ie4
1245
+ pie5 p ie5
1246
+ pin1 p in1
1247
+ pin2 p in2
1248
+ pin3 p in3
1249
+ pin4 p in4
1250
+ pin5 p in5
1251
+ ping1 p ing1
1252
+ ping2 p ing2
1253
+ ping3 p ing3
1254
+ ping4 p ing4
1255
+ ping5 p ing5
1256
+ po1 p o1
1257
+ po2 p o2
1258
+ po3 p o3
1259
+ po4 p o4
1260
+ po5 p o5
1261
+ pou1 p ou1
1262
+ pou2 p ou2
1263
+ pou3 p ou3
1264
+ pou4 p ou4
1265
+ pou5 p ou5
1266
+ pu1 p u1
1267
+ pu2 p u2
1268
+ pu3 p u3
1269
+ pu4 p u4
1270
+ pu5 p u5
1271
+ qi1 q i1
1272
+ qi2 q i2
1273
+ qi3 q i3
1274
+ qi4 q i4
1275
+ qi5 q i5
1276
+ qia1 q ia1
1277
+ qia2 q ia2
1278
+ qia3 q ia3
1279
+ qia4 q ia4
1280
+ qia5 q ia5
1281
+ qian1 q ian1
1282
+ qian2 q ian2
1283
+ qian3 q ian3
1284
+ qian4 q ian4
1285
+ qian5 q ian5
1286
+ qiang1 q iang1
1287
+ qiang2 q iang2
1288
+ qiang3 q iang3
1289
+ qiang4 q iang4
1290
+ qiang5 q iang5
1291
+ qiao1 q iao1
1292
+ qiao2 q iao2
1293
+ qiao3 q iao3
1294
+ qiao4 q iao4
1295
+ qiao5 q iao5
1296
+ qie1 q ie1
1297
+ qie2 q ie2
1298
+ qie3 q ie3
1299
+ qie4 q ie4
1300
+ qie5 q ie5
1301
+ qin1 q in1
1302
+ qin2 q in2
1303
+ qin3 q in3
1304
+ qin4 q in4
1305
+ qin5 q in5
1306
+ qing1 q ing1
1307
+ qing2 q ing2
1308
+ qing3 q ing3
1309
+ qing4 q ing4
1310
+ qing5 q ing5
1311
+ qiong1 q iong1
1312
+ qiong2 q iong2
1313
+ qiong3 q iong3
1314
+ qiong4 q iong4
1315
+ qiong5 q iong5
1316
+ qiu1 q iou1
1317
+ qiu2 q iou2
1318
+ qiu3 q iou3
1319
+ qiu4 q iou4
1320
+ qiu5 q iou5
1321
+ qu1 q v1
1322
+ qu2 q v2
1323
+ qu3 q v3
1324
+ qu4 q v4
1325
+ qu5 q v5
1326
+ quan1 q van1
1327
+ quan2 q van2
1328
+ quan3 q van3
1329
+ quan4 q van4
1330
+ quan5 q van5
1331
+ que1 q ve1
1332
+ que2 q ve2
1333
+ que3 q ve3
1334
+ que4 q ve4
1335
+ que5 q ve5
1336
+ qun1 q vn1
1337
+ qun2 q vn2
1338
+ qun3 q vn3
1339
+ qun4 q vn4
1340
+ qun5 q vn5
1341
+ ran1 r an1
1342
+ ran2 r an2
1343
+ ran3 r an3
1344
+ ran4 r an4
1345
+ ran5 r an5
1346
+ rang1 r ang1
1347
+ rang2 r ang2
1348
+ rang3 r ang3
1349
+ rang4 r ang4
1350
+ rang5 r ang5
1351
+ rao1 r ao1
1352
+ rao2 r ao2
1353
+ rao3 r ao3
1354
+ rao4 r ao4
1355
+ rao5 r ao5
1356
+ re1 r e1
1357
+ re2 r e2
1358
+ re3 r e3
1359
+ re4 r e4
1360
+ re5 r e5
1361
+ ren1 r en1
1362
+ ren2 r en2
1363
+ ren3 r en3
1364
+ ren4 r en4
1365
+ ren5 r en5
1366
+ reng1 r eng1
1367
+ reng2 r eng2
1368
+ reng3 r eng3
1369
+ reng4 r eng4
1370
+ reng5 r eng5
1371
+ ri1 r iii1
1372
+ ri2 r iii2
1373
+ ri3 r iii3
1374
+ ri4 r iii4
1375
+ ri5 r iii5
1376
+ rong1 r ong1
1377
+ rong2 r ong2
1378
+ rong3 r ong3
1379
+ rong4 r ong4
1380
+ rong5 r ong5
1381
+ rou1 r ou1
1382
+ rou2 r ou2
1383
+ rou3 r ou3
1384
+ rou4 r ou4
1385
+ rou5 r ou5
1386
+ ru1 r u1
1387
+ ru2 r u2
1388
+ ru3 r u3
1389
+ ru4 r u4
1390
+ ru5 r u5
1391
+ rua1 r ua1
1392
+ rua2 r ua2
1393
+ rua3 r ua3
1394
+ rua4 r ua4
1395
+ rua5 r ua5
1396
+ ruan1 r uan1
1397
+ ruan2 r uan2
1398
+ ruan3 r uan3
1399
+ ruan4 r uan4
1400
+ ruan5 r uan5
1401
+ rui1 r uei1
1402
+ rui2 r uei2
1403
+ rui3 r uei3
1404
+ rui4 r uei4
1405
+ rui5 r uei5
1406
+ run1 r uen1
1407
+ run2 r uen2
1408
+ run3 r uen3
1409
+ run4 r uen4
1410
+ run5 r uen5
1411
+ ruo1 r uo1
1412
+ ruo2 r uo2
1413
+ ruo3 r uo3
1414
+ ruo4 r uo4
1415
+ ruo5 r uo5
1416
+ sa1 s a1
1417
+ sa2 s a2
1418
+ sa3 s a3
1419
+ sa4 s a4
1420
+ sa5 s a5
1421
+ sai1 s ai1
1422
+ sai2 s ai2
1423
+ sai3 s ai3
1424
+ sai4 s ai4
1425
+ sai5 s ai5
1426
+ san1 s an1
1427
+ san2 s an2
1428
+ san3 s an3
1429
+ san4 s an4
1430
+ san5 s an5
1431
+ sang1 s ang1
1432
+ sang2 s ang2
1433
+ sang3 s ang3
1434
+ sang4 s ang4
1435
+ sang5 s ang5
1436
+ sao1 s ao1
1437
+ sao2 s ao2
1438
+ sao3 s ao3
1439
+ sao4 s ao4
1440
+ sao5 s ao5
1441
+ se1 s e1
1442
+ se2 s e2
1443
+ se3 s e3
1444
+ se4 s e4
1445
+ se5 s e5
1446
+ sen1 s en1
1447
+ sen2 s en2
1448
+ sen3 s en3
1449
+ sen4 s en4
1450
+ sen5 s en5
1451
+ seng1 s eng1
1452
+ seng2 s eng2
1453
+ seng3 s eng3
1454
+ seng4 s eng4
1455
+ seng5 s eng5
1456
+ sha1 sh a1
1457
+ sha2 sh a2
1458
+ sha3 sh a3
1459
+ sha4 sh a4
1460
+ sha5 sh a5
1461
+ shai1 sh ai1
1462
+ shai2 sh ai2
1463
+ shai3 sh ai3
1464
+ shai4 sh ai4
1465
+ shai5 sh ai5
1466
+ shan1 sh an1
1467
+ shan2 sh an2
1468
+ shan3 sh an3
1469
+ shan4 sh an4
1470
+ shan5 sh an5
1471
+ shang1 sh ang1
1472
+ shang2 sh ang2
1473
+ shang3 sh ang3
1474
+ shang4 sh ang4
1475
+ shang5 sh ang5
1476
+ shao1 sh ao1
1477
+ shao2 sh ao2
1478
+ shao3 sh ao3
1479
+ shao4 sh ao4
1480
+ shao5 sh ao5
1481
+ she1 sh e1
1482
+ she2 sh e2
1483
+ she3 sh e3
1484
+ she4 sh e4
1485
+ she5 sh e5
1486
+ shei1 sh ei1
1487
+ shei2 sh ei2
1488
+ shei3 sh ei3
1489
+ shei4 sh ei4
1490
+ shei5 sh ei5
1491
+ shen1 sh en1
1492
+ shen2 sh en2
1493
+ shen3 sh en3
1494
+ shen4 sh en4
1495
+ shen5 sh en5
1496
+ sheng1 sh eng1
1497
+ sheng2 sh eng2
1498
+ sheng3 sh eng3
1499
+ sheng4 sh eng4
1500
+ sheng5 sh eng5
1501
+ shi1 sh iii1
1502
+ shi2 sh iii2
1503
+ shi3 sh iii3
1504
+ shi4 sh iii4
1505
+ shi5 sh iii5
1506
+ shou1 sh ou1
1507
+ shou2 sh ou2
1508
+ shou3 sh ou3
1509
+ shou4 sh ou4
1510
+ shou5 sh ou5
1511
+ shu1 sh u1
1512
+ shu2 sh u2
1513
+ shu3 sh u3
1514
+ shu4 sh u4
1515
+ shu5 sh u5
1516
+ shua1 sh ua1
1517
+ shua2 sh ua2
1518
+ shua3 sh ua3
1519
+ shua4 sh ua4
1520
+ shua5 sh ua5
1521
+ shuai1 sh uai1
1522
+ shuai2 sh uai2
1523
+ shuai3 sh uai3
1524
+ shuai4 sh uai4
1525
+ shuai5 sh uai5
1526
+ shuan1 sh uan1
1527
+ shuan2 sh uan2
1528
+ shuan3 sh uan3
1529
+ shuan4 sh uan4
1530
+ shuan5 sh uan5
1531
+ shuang1 sh uang1
1532
+ shuang2 sh uang2
1533
+ shuang3 sh uang3
1534
+ shuang4 sh uang4
1535
+ shuang5 sh uang5
1536
+ shui1 sh uei1
1537
+ shui2 sh uei2
1538
+ shui3 sh uei3
1539
+ shui4 sh uei4
1540
+ shui5 sh uei5
1541
+ shun1 sh uen1
1542
+ shun2 sh uen2
1543
+ shun3 sh uen3
1544
+ shun4 sh uen4
1545
+ shun5 sh uen5
1546
+ shuo1 sh uo1
1547
+ shuo2 sh uo2
1548
+ shuo3 sh uo3
1549
+ shuo4 sh uo4
1550
+ shuo5 sh uo5
1551
+ si1 s ii1
1552
+ si2 s ii2
1553
+ si3 s ii3
1554
+ si4 s ii4
1555
+ si5 s ii5
1556
+ song1 s ong1
1557
+ song2 s ong2
1558
+ song3 s ong3
1559
+ song4 s ong4
1560
+ song5 s ong5
1561
+ sou1 s ou1
1562
+ sou2 s ou2
1563
+ sou3 s ou3
1564
+ sou4 s ou4
1565
+ sou5 s ou5
1566
+ su1 s u1
1567
+ su2 s u2
1568
+ su3 s u3
1569
+ su4 s u4
1570
+ su5 s u5
1571
+ suan1 s uan1
1572
+ suan2 s uan2
1573
+ suan3 s uan3
1574
+ suan4 s uan4
1575
+ suan5 s uan5
1576
+ sui1 s uei1
1577
+ sui2 s uei2
1578
+ sui3 s uei3
1579
+ sui4 s uei4
1580
+ sui5 s uei5
1581
+ sun1 s uen1
1582
+ sun2 s uen2
1583
+ sun3 s uen3
1584
+ sun4 s uen4
1585
+ sun5 s uen5
1586
+ suo1 s uo1
1587
+ suo2 s uo2
1588
+ suo3 s uo3
1589
+ suo4 s uo4
1590
+ suo5 s uo5
1591
+ ta1 t a1
1592
+ ta2 t a2
1593
+ ta3 t a3
1594
+ ta4 t a4
1595
+ ta5 t a5
1596
+ tai1 t ai1
1597
+ tai2 t ai2
1598
+ tai3 t ai3
1599
+ tai4 t ai4
1600
+ tai5 t ai5
1601
+ tan1 t an1
1602
+ tan2 t an2
1603
+ tan3 t an3
1604
+ tan4 t an4
1605
+ tan5 t an5
1606
+ tang1 t ang1
1607
+ tang2 t ang2
1608
+ tang3 t ang3
1609
+ tang4 t ang4
1610
+ tang5 t ang5
1611
+ tao1 t ao1
1612
+ tao2 t ao2
1613
+ tao3 t ao3
1614
+ tao4 t ao4
1615
+ tao5 t ao5
1616
+ te1 t e1
1617
+ te2 t e2
1618
+ te3 t e3
1619
+ te4 t e4
1620
+ te5 t e5
1621
+ tei1 t ei1
1622
+ tei2 t ei2
1623
+ tei3 t ei3
1624
+ tei4 t ei4
1625
+ tei5 t ei5
1626
+ teng1 t eng1
1627
+ teng2 t eng2
1628
+ teng3 t eng3
1629
+ teng4 t eng4
1630
+ teng5 t eng5
1631
+ ti1 t i1
1632
+ ti2 t i2
1633
+ ti3 t i3
1634
+ ti4 t i4
1635
+ ti5 t i5
1636
+ tian1 t ian1
1637
+ tian2 t ian2
1638
+ tian3 t ian3
1639
+ tian4 t ian4
1640
+ tian5 t ian5
1641
+ tiao1 t iao1
1642
+ tiao2 t iao2
1643
+ tiao3 t iao3
1644
+ tiao4 t iao4
1645
+ tiao5 t iao5
1646
+ tie1 t ie1
1647
+ tie2 t ie2
1648
+ tie3 t ie3
1649
+ tie4 t ie4
1650
+ tie5 t ie5
1651
+ ting1 t ing1
1652
+ ting2 t ing2
1653
+ ting3 t ing3
1654
+ ting4 t ing4
1655
+ ting5 t ing5
1656
+ tong1 t ong1
1657
+ tong2 t ong2
1658
+ tong3 t ong3
1659
+ tong4 t ong4
1660
+ tong5 t ong5
1661
+ tou1 t ou1
1662
+ tou2 t ou2
1663
+ tou3 t ou3
1664
+ tou4 t ou4
1665
+ tou5 t ou5
1666
+ tu1 t u1
1667
+ tu2 t u2
1668
+ tu3 t u3
1669
+ tu4 t u4
1670
+ tu5 t u5
1671
+ tuan1 t uan1
1672
+ tuan2 t uan2
1673
+ tuan3 t uan3
1674
+ tuan4 t uan4
1675
+ tuan5 t uan5
1676
+ tui1 t uei1
1677
+ tui2 t uei2
1678
+ tui3 t uei3
1679
+ tui4 t uei4
1680
+ tui5 t uei5
1681
+ tun1 t uen1
1682
+ tun2 t uen2
1683
+ tun3 t uen3
1684
+ tun4 t uen4
1685
+ tun5 t uen5
1686
+ tuo1 t uo1
1687
+ tuo2 t uo2
1688
+ tuo3 t uo3
1689
+ tuo4 t uo4
1690
+ tuo5 t uo5
1691
+ wa1 w ua1
1692
+ wa2 w ua2
1693
+ wa3 w ua3
1694
+ wa4 w ua4
1695
+ wa5 w ua5
1696
+ wai1 w uai1
1697
+ wai2 w uai2
1698
+ wai3 w uai3
1699
+ wai4 w uai4
1700
+ wai5 w uai5
1701
+ wan1 w uan1
1702
+ wan2 w uan2
1703
+ wan3 w uan3
1704
+ wan4 w uan4
1705
+ wan5 w uan5
1706
+ wang1 w uang1
1707
+ wang2 w uang2
1708
+ wang3 w uang3
1709
+ wang4 w uang4
1710
+ wang5 w uang5
1711
+ wei1 w uei1
1712
+ wei2 w uei2
1713
+ wei3 w uei3
1714
+ wei4 w uei4
1715
+ wei5 w uei5
1716
+ wen1 w uen1
1717
+ wen2 w uen2
1718
+ wen3 w uen3
1719
+ wen4 w uen4
1720
+ wen5 w uen5
1721
+ weng1 w uen1
1722
+ weng2 w uen2
1723
+ weng3 w uen3
1724
+ weng4 w uen4
1725
+ weng5 w uen5
1726
+ wo1 w uo1
1727
+ wo2 w uo2
1728
+ wo3 w uo3
1729
+ wo4 w uo4
1730
+ wo5 w uo5
1731
+ wu1 w u1
1732
+ wu2 w u2
1733
+ wu3 w u3
1734
+ wu4 w u4
1735
+ wu5 w u5
1736
+ xi1 x i1
1737
+ xi2 x i2
1738
+ xi3 x i3
1739
+ xi4 x i4
1740
+ xi5 x i5
1741
+ xia1 x ia1
1742
+ xia2 x ia2
1743
+ xia3 x ia3
1744
+ xia4 x ia4
1745
+ xia5 x ia5
1746
+ xian1 x ian1
1747
+ xian2 x ian2
1748
+ xian3 x ian3
1749
+ xian4 x ian4
1750
+ xian5 x ian5
1751
+ xiang1 x iang1
1752
+ xiang2 x iang2
1753
+ xiang3 x iang3
1754
+ xiang4 x iang4
1755
+ xiang5 x iang5
1756
+ xiao1 x iao1
1757
+ xiao2 x iao2
1758
+ xiao3 x iao3
1759
+ xiao4 x iao4
1760
+ xiao5 x iao5
1761
+ xie1 x ie1
1762
+ xie2 x ie2
1763
+ xie3 x ie3
1764
+ xie4 x ie4
1765
+ xie5 x ie5
1766
+ xin1 x in1
1767
+ xin2 x in2
1768
+ xin3 x in3
1769
+ xin4 x in4
1770
+ xin5 x in5
1771
+ xing1 x ing1
1772
+ xing2 x ing2
1773
+ xing3 x ing3
1774
+ xing4 x ing4
1775
+ xing5 x ing5
1776
+ xiong1 x iong1
1777
+ xiong2 x iong2
1778
+ xiong3 x iong3
1779
+ xiong4 x iong4
1780
+ xiong5 x iong5
1781
+ xiu1 x iou1
1782
+ xiu2 x iou2
1783
+ xiu3 x iou3
1784
+ xiu4 x iou4
1785
+ xiu5 x iou5
1786
+ xu1 x v1
1787
+ xu2 x v2
1788
+ xu3 x v3
1789
+ xu4 x v4
1790
+ xu5 x v5
1791
+ xuan1 x van1
1792
+ xuan2 x van2
1793
+ xuan3 x van3
1794
+ xuan4 x van4
1795
+ xuan5 x van5
1796
+ xue1 x ve1
1797
+ xue2 x ve2
1798
+ xue3 x ve3
1799
+ xue4 x ve4
1800
+ xue5 x ve5
1801
+ xun1 x vn1
1802
+ xun2 x vn2
1803
+ xun3 x vn3
1804
+ xun4 x vn4
1805
+ xun5 x vn5
1806
+ ya1 y ia1
1807
+ ya2 y ia2
1808
+ ya3 y ia3
1809
+ ya4 y ia4
1810
+ ya5 y ia5
1811
+ yan1 y ian1
1812
+ yan2 y ian2
1813
+ yan3 y ian3
1814
+ yan4 y ian4
1815
+ yan5 y ian5
1816
+ yang1 y iang1
1817
+ yang2 y iang2
1818
+ yang3 y iang3
1819
+ yang4 y iang4
1820
+ yang5 y iang5
1821
+ yao1 y iao1
1822
+ yao2 y iao2
1823
+ yao3 y iao3
1824
+ yao4 y iao4
1825
+ yao5 y iao5
1826
+ ye1 y ie1
1827
+ ye2 y ie2
1828
+ ye3 y ie3
1829
+ ye4 y ie4
1830
+ ye5 y ie5
1831
+ yi1 y i1
1832
+ yi2 y i2
1833
+ yi3 y i3
1834
+ yi4 y i4
1835
+ yi5 y i5
1836
+ yin1 y in1
1837
+ yin2 y in2
1838
+ yin3 y in3
1839
+ yin4 y in4
1840
+ yin5 y in5
1841
+ ying1 y ing1
1842
+ ying2 y ing2
1843
+ ying3 y ing3
1844
+ ying4 y ing4
1845
+ ying5 y ing5
1846
+ yo1 y iou1
1847
+ yo2 y iou2
1848
+ yo3 y iou3
1849
+ yo4 y iou4
1850
+ yo5 y iou5
1851
+ yong1 y iong1
1852
+ yong2 y iong2
1853
+ yong3 y iong3
1854
+ yong4 y iong4
1855
+ yong5 y iong5
1856
+ you1 y iou1
1857
+ you2 y iou2
1858
+ you3 y iou3
1859
+ you4 y iou4
1860
+ you5 y iou5
1861
+ yu1 y v1
1862
+ yu2 y v2
1863
+ yu3 y v3
1864
+ yu4 y v4
1865
+ yu5 y v5
1866
+ yuan1 y van1
1867
+ yuan2 y van2
1868
+ yuan3 y van3
1869
+ yuan4 y van4
1870
+ yuan5 y van5
1871
+ yue1 y ve1
1872
+ yue2 y ve2
1873
+ yue3 y ve3
1874
+ yue4 y ve4
1875
+ yue5 y ve5
1876
+ yun1 y vn1
1877
+ yun2 y vn2
1878
+ yun3 y vn3
1879
+ yun4 y vn4
1880
+ yun5 y vn5
1881
+ za1 z a1
1882
+ za2 z a2
1883
+ za3 z a3
1884
+ za4 z a4
1885
+ za5 z a5
1886
+ zai1 z ai1
1887
+ zai2 z ai2
1888
+ zai3 z ai3
1889
+ zai4 z ai4
1890
+ zai5 z ai5
1891
+ zan1 z an1
1892
+ zan2 z an2
1893
+ zan3 z an3
1894
+ zan4 z an4
1895
+ zan5 z an5
1896
+ zang1 z ang1
1897
+ zang2 z ang2
1898
+ zang3 z ang3
1899
+ zang4 z ang4
1900
+ zang5 z ang5
1901
+ zao1 z ao1
1902
+ zao2 z ao2
1903
+ zao3 z ao3
1904
+ zao4 z ao4
1905
+ zao5 z ao5
1906
+ ze1 z e1
1907
+ ze2 z e2
1908
+ ze3 z e3
1909
+ ze4 z e4
1910
+ ze5 z e5
1911
+ zei1 z ei1
1912
+ zei2 z ei2
1913
+ zei3 z ei3
1914
+ zei4 z ei4
1915
+ zei5 z ei5
1916
+ zen1 z en1
1917
+ zen2 z en2
1918
+ zen3 z en3
1919
+ zen4 z en4
1920
+ zen5 z en5
1921
+ zeng1 z eng1
1922
+ zeng2 z eng2
1923
+ zeng3 z eng3
1924
+ zeng4 z eng4
1925
+ zeng5 z eng5
1926
+ zha1 zh a1
1927
+ zha2 zh a2
1928
+ zha3 zh a3
1929
+ zha4 zh a4
1930
+ zha5 zh a5
1931
+ zhai1 zh ai1
1932
+ zhai2 zh ai2
1933
+ zhai3 zh ai3
1934
+ zhai4 zh ai4
1935
+ zhai5 zh ai5
1936
+ zhan1 zh an1
1937
+ zhan2 zh an2
1938
+ zhan3 zh an3
1939
+ zhan4 zh an4
1940
+ zhan5 zh an5
1941
+ zhang1 zh ang1
1942
+ zhang2 zh ang2
1943
+ zhang3 zh ang3
1944
+ zhang4 zh ang4
1945
+ zhang5 zh ang5
1946
+ zhao1 zh ao1
1947
+ zhao2 zh ao2
1948
+ zhao3 zh ao3
1949
+ zhao4 zh ao4
1950
+ zhao5 zh ao5
1951
+ zhe1 zh e1
1952
+ zhe2 zh e2
1953
+ zhe3 zh e3
1954
+ zhe4 zh e4
1955
+ zhe5 zh e5
1956
+ zhei1 zh ei1
1957
+ zhei2 zh ei2
1958
+ zhei3 zh ei3
1959
+ zhei4 zh ei4
1960
+ zhei5 zh ei5
1961
+ zhen1 zh en1
1962
+ zhen2 zh en2
1963
+ zhen3 zh en3
1964
+ zhen4 zh en4
1965
+ zhen5 zh en5
1966
+ zheng1 zh eng1
1967
+ zheng2 zh eng2
1968
+ zheng3 zh eng3
1969
+ zheng4 zh eng4
1970
+ zheng5 zh eng5
1971
+ zhi1 zh iii1
1972
+ zhi2 zh iii2
1973
+ zhi3 zh iii3
1974
+ zhi4 zh iii4
1975
+ zhi5 zh iii5
1976
+ zhong1 zh ong1
1977
+ zhong2 zh ong2
1978
+ zhong3 zh ong3
1979
+ zhong4 zh ong4
1980
+ zhong5 zh ong5
1981
+ zhou1 zh ou1
1982
+ zhou2 zh ou2
1983
+ zhou3 zh ou3
1984
+ zhou4 zh ou4
1985
+ zhou5 zh ou5
1986
+ zhu1 zh u1
1987
+ zhu2 zh u2
1988
+ zhu3 zh u3
1989
+ zhu4 zh u4
1990
+ zhu5 zh u5
1991
+ zhua1 zh ua1
1992
+ zhua2 zh ua2
1993
+ zhua3 zh ua3
1994
+ zhua4 zh ua4
1995
+ zhua5 zh ua5
1996
+ zhuai1 zh uai1
1997
+ zhuai2 zh uai2
1998
+ zhuai3 zh uai3
1999
+ zhuai4 zh uai4
2000
+ zhuai5 zh uai5
2001
+ zhuan1 zh uan1
2002
+ zhuan2 zh uan2
2003
+ zhuan3 zh uan3
2004
+ zhuan4 zh uan4
2005
+ zhuan5 zh uan5
2006
+ zhuang1 zh uang1
2007
+ zhuang2 zh uang2
2008
+ zhuang3 zh uang3
2009
+ zhuang4 zh uang4
2010
+ zhuang5 zh uang5
2011
+ zhui1 zh uei1
2012
+ zhui2 zh uei2
2013
+ zhui3 zh uei3
2014
+ zhui4 zh uei4
2015
+ zhui5 zh uei5
2016
+ zhun1 zh uen1
2017
+ zhun2 zh uen2
2018
+ zhun3 zh uen3
2019
+ zhun4 zh uen4
2020
+ zhun5 zh uen5
2021
+ zhuo1 zh uo1
2022
+ zhuo2 zh uo2
2023
+ zhuo3 zh uo3
2024
+ zhuo4 zh uo4
2025
+ zhuo5 zh uo5
2026
+ zi1 z ii1
2027
+ zi2 z ii2
2028
+ zi3 z ii3
2029
+ zi4 z ii4
2030
+ zi5 z ii5
2031
+ zong1 z ong1
2032
+ zong2 z ong2
2033
+ zong3 z ong3
2034
+ zong4 z ong4
2035
+ zong5 z ong5
2036
+ zou1 z ou1
2037
+ zou2 z ou2
2038
+ zou3 z ou3
2039
+ zou4 z ou4
2040
+ zou5 z ou5
2041
+ zu1 z u1
2042
+ zu2 z u2
2043
+ zu3 z u3
2044
+ zu4 z u4
2045
+ zu5 z u5
2046
+ zuan1 z uan1
2047
+ zuan2 z uan2
2048
+ zuan3 z uan3
2049
+ zuan4 z uan4
2050
+ zuan5 z uan5
2051
+ zui1 z uei1
2052
+ zui2 z uei2
2053
+ zui3 z uei3
2054
+ zui4 z uei4
2055
+ zui5 z uei5
2056
+ zun1 z uen1
2057
+ zun2 z uen2
2058
+ zun3 z uen3
2059
+ zun4 z uen4
2060
+ zun5 z uen5
2061
+ zuo1 z uo1
2062
+ zuo2 z uo2
2063
+ zuo3 z uo3
2064
+ zuo4 z uo4
2065
+ zuo5 z uo5
2066
+ ar1 a1 rr
2067
+ ar2 a2 rr
2068
+ ar3 a3 rr
2069
+ ar4 a4 rr
2070
+ ar5 a5 rr
2071
+ air1 ai1 rr
2072
+ air2 ai2 rr
2073
+ air3 ai3 rr
2074
+ air4 ai4 rr
2075
+ air5 ai5 rr
2076
+ anr1 an1 rr
2077
+ anr2 an2 rr
2078
+ anr3 an3 rr
2079
+ anr4 an4 rr
2080
+ anr5 an5 rr
2081
+ angr1 ang1 rr
2082
+ angr2 ang2 rr
2083
+ angr3 ang3 rr
2084
+ angr4 ang4 rr
2085
+ angr5 ang5 rr
2086
+ aor1 ao1 rr
2087
+ aor2 ao2 rr
2088
+ aor3 ao3 rr
2089
+ aor4 ao4 rr
2090
+ aor5 ao5 rr
2091
+ bar1 b a1 rr
2092
+ bar2 b a2 rr
2093
+ bar3 b a3 rr
2094
+ bar4 b a4 rr
2095
+ bar5 b a5 rr
2096
+ bair1 b ai1 rr
2097
+ bair2 b ai2 rr
2098
+ bair3 b ai3 rr
2099
+ bair4 b ai4 rr
2100
+ bair5 b ai5 rr
2101
+ banr1 b an1 rr
2102
+ banr2 b an2 rr
2103
+ banr3 b an3 rr
2104
+ banr4 b an4 rr
2105
+ banr5 b an5 rr
2106
+ bangr1 b ang1 rr
2107
+ bangr2 b ang2 rr
2108
+ bangr3 b ang3 rr
2109
+ bangr4 b ang4 rr
2110
+ bangr5 b ang5 rr
2111
+ baor1 b ao1 rr
2112
+ baor2 b ao2 rr
2113
+ baor3 b ao3 rr
2114
+ baor4 b ao4 rr
2115
+ baor5 b ao5 rr
2116
+ beir1 b ei1 rr
2117
+ beir2 b ei2 rr
2118
+ beir3 b ei3 rr
2119
+ beir4 b ei4 rr
2120
+ beir5 b ei5 rr
2121
+ benr1 b en1 rr
2122
+ benr2 b en2 rr
2123
+ benr3 b en3 rr
2124
+ benr4 b en4 rr
2125
+ benr5 b en5 rr
2126
+ bengr1 b eng1 rr
2127
+ bengr2 b eng2 rr
2128
+ bengr3 b eng3 rr
2129
+ bengr4 b eng4 rr
2130
+ bengr5 b eng5 rr
2131
+ bir1 b i1 rr
2132
+ bir2 b i2 rr
2133
+ bir3 b i3 rr
2134
+ bir4 b i4 rr
2135
+ bir5 b i5 rr
2136
+ bianr1 b ian1 rr
2137
+ bianr2 b ian2 rr
2138
+ bianr3 b ian3 rr
2139
+ bianr4 b ian4 rr
2140
+ bianr5 b ian5 rr
2141
+ biaor1 b iao1 rr
2142
+ biaor2 b iao2 rr
2143
+ biaor3 b iao3 rr
2144
+ biaor4 b iao4 rr
2145
+ biaor5 b iao5 rr
2146
+ bier1 b ie1 rr
2147
+ bier2 b ie2 rr
2148
+ bier3 b ie3 rr
2149
+ bier4 b ie4 rr
2150
+ bier5 b ie5 rr
2151
+ binr1 b in1 rr
2152
+ binr2 b in2 rr
2153
+ binr3 b in3 rr
2154
+ binr4 b in4 rr
2155
+ binr5 b in5 rr
2156
+ bingr1 b ing1 rr
2157
+ bingr2 b ing2 rr
2158
+ bingr3 b ing3 rr
2159
+ bingr4 b ing4 rr
2160
+ bingr5 b ing5 rr
2161
+ bor1 b o1 rr
2162
+ bor2 b o2 rr
2163
+ bor3 b o3 rr
2164
+ bor4 b o4 rr
2165
+ bor5 b o5 rr
2166
+ bur1 b u1 rr
2167
+ bur2 b u2 rr
2168
+ bur3 b u3 rr
2169
+ bur4 b u4 rr
2170
+ bur5 b u5 rr
2171
+ car1 c a1 rr
2172
+ car2 c a2 rr
2173
+ car3 c a3 rr
2174
+ car4 c a4 rr
2175
+ car5 c a5 rr
2176
+ cair1 c ai1 rr
2177
+ cair2 c ai2 rr
2178
+ cair3 c ai3 rr
2179
+ cair4 c ai4 rr
2180
+ cair5 c ai5 rr
2181
+ canr1 c an1 rr
2182
+ canr2 c an2 rr
2183
+ canr3 c an3 rr
2184
+ canr4 c an4 rr
2185
+ canr5 c an5 rr
2186
+ cangr1 c ang1 rr
2187
+ cangr2 c ang2 rr
2188
+ cangr3 c ang3 rr
2189
+ cangr4 c ang4 rr
2190
+ cangr5 c ang5 rr
2191
+ caor1 c ao1 rr
2192
+ caor2 c ao2 rr
2193
+ caor3 c ao3 rr
2194
+ caor4 c ao4 rr
2195
+ caor5 c ao5 rr
2196
+ cer1 c e1 rr
2197
+ cer2 c e2 rr
2198
+ cer3 c e3 rr
2199
+ cer4 c e4 rr
2200
+ cer5 c e5 rr
2201
+ cenr1 c en1 rr
2202
+ cenr2 c en2 rr
2203
+ cenr3 c en3 rr
2204
+ cenr4 c en4 rr
2205
+ cenr5 c en5 rr
2206
+ cengr1 c eng1 rr
2207
+ cengr2 c eng2 rr
2208
+ cengr3 c eng3 rr
2209
+ cengr4 c eng4 rr
2210
+ cengr5 c eng5 rr
2211
+ char1 ch a1 rr
2212
+ char2 ch a2 rr
2213
+ char3 ch a3 rr
2214
+ char4 ch a4 rr
2215
+ char5 ch a5 rr
2216
+ chair1 ch ai1 rr
2217
+ chair2 ch ai2 rr
2218
+ chair3 ch ai3 rr
2219
+ chair4 ch ai4 rr
2220
+ chair5 ch ai5 rr
2221
+ chanr1 ch an1 rr
2222
+ chanr2 ch an2 rr
2223
+ chanr3 ch an3 rr
2224
+ chanr4 ch an4 rr
2225
+ chanr5 ch an5 rr
2226
+ changr1 ch ang1 rr
2227
+ changr2 ch ang2 rr
2228
+ changr3 ch ang3 rr
2229
+ changr4 ch ang4 rr
2230
+ changr5 ch ang5 rr
2231
+ chaor1 ch ao1 rr
2232
+ chaor2 ch ao2 rr
2233
+ chaor3 ch ao3 rr
2234
+ chaor4 ch ao4 rr
2235
+ chaor5 ch ao5 rr
2236
+ cher1 ch e1 rr
2237
+ cher2 ch e2 rr
2238
+ cher3 ch e3 rr
2239
+ cher4 ch e4 rr
2240
+ cher5 ch e5 rr
2241
+ chenr1 ch en1 rr
2242
+ chenr2 ch en2 rr
2243
+ chenr3 ch en3 rr
2244
+ chenr4 ch en4 rr
2245
+ chenr5 ch en5 rr
2246
+ chengr1 ch eng1 rr
2247
+ chengr2 ch eng2 rr
2248
+ chengr3 ch eng3 rr
2249
+ chengr4 ch eng4 rr
2250
+ chengr5 ch eng5 rr
2251
+ chir1 ch iii1 rr
2252
+ chir2 ch iii2 rr
2253
+ chir3 ch iii3 rr
2254
+ chir4 ch iii4 rr
2255
+ chir5 ch iii5 rr
2256
+ chongr1 ch ong1 rr
2257
+ chongr2 ch ong2 rr
2258
+ chongr3 ch ong3 rr
2259
+ chongr4 ch ong4 rr
2260
+ chongr5 ch ong5 rr
2261
+ chour1 ch ou1 rr
2262
+ chour2 ch ou2 rr
2263
+ chour3 ch ou3 rr
2264
+ chour4 ch ou4 rr
2265
+ chour5 ch ou5 rr
2266
+ chur1 ch u1 rr
2267
+ chur2 ch u2 rr
2268
+ chur3 ch u3 rr
2269
+ chur4 ch u4 rr
2270
+ chur5 ch u5 rr
2271
+ chuair1 ch uai1 rr
2272
+ chuair2 ch uai2 rr
2273
+ chuair3 ch uai3 rr
2274
+ chuair4 ch uai4 rr
2275
+ chuair5 ch uai5 rr
2276
+ chuanr1 ch uan1 rr
2277
+ chuanr2 ch uan2 rr
2278
+ chuanr3 ch uan3 rr
2279
+ chuanr4 ch uan4 rr
2280
+ chuanr5 ch uan5 rr
2281
+ chuangr1 ch uang1 rr
2282
+ chuangr2 ch uang2 rr
2283
+ chuangr3 ch uang3 rr
2284
+ chuangr4 ch uang4 rr
2285
+ chuangr5 ch uang5 rr
2286
+ chuir1 ch uei1 rr
2287
+ chuir2 ch uei2 rr
2288
+ chuir3 ch uei3 rr
2289
+ chuir4 ch uei4 rr
2290
+ chuir5 ch uei5 rr
2291
+ chunr1 ch uen1 rr
2292
+ chunr2 ch uen2 rr
2293
+ chunr3 ch uen3 rr
2294
+ chunr4 ch uen4 rr
2295
+ chunr5 ch uen5 rr
2296
+ chuor1 ch uo1 rr
2297
+ chuor2 ch uo2 rr
2298
+ chuor3 ch uo3 rr
2299
+ chuor4 ch uo4 rr
2300
+ chuor5 ch uo5 rr
2301
+ cir1 c ii1 rr
2302
+ cir2 c ii2 rr
2303
+ cir3 c ii3 rr
2304
+ cir4 c ii4 rr
2305
+ cir5 c ii5 rr
2306
+ congr1 c ong1 rr
2307
+ congr2 c ong2 rr
2308
+ congr3 c ong3 rr
2309
+ congr4 c ong4 rr
2310
+ congr5 c ong5 rr
2311
+ cour1 c ou1 rr
2312
+ cour2 c ou2 rr
2313
+ cour3 c ou3 rr
2314
+ cour4 c ou4 rr
2315
+ cour5 c ou5 rr
2316
+ cur1 c u1 rr
2317
+ cur2 c u2 rr
2318
+ cur3 c u3 rr
2319
+ cur4 c u4 rr
2320
+ cur5 c u5 rr
2321
+ cuanr1 c uan1 rr
2322
+ cuanr2 c uan2 rr
2323
+ cuanr3 c uan3 rr
2324
+ cuanr4 c uan4 rr
2325
+ cuanr5 c uan5 rr
2326
+ cuir1 c uei1 rr
2327
+ cuir2 c uei2 rr
2328
+ cuir3 c uei3 rr
2329
+ cuir4 c uei4 rr
2330
+ cuir5 c uei5 rr
2331
+ cunr1 c uen1 rr
2332
+ cunr2 c uen2 rr
2333
+ cunr3 c uen3 rr
2334
+ cunr4 c uen4 rr
2335
+ cunr5 c uen5 rr
2336
+ cuor1 c uo1 rr
2337
+ cuor2 c uo2 rr
2338
+ cuor3 c uo3 rr
2339
+ cuor4 c uo4 rr
2340
+ cuor5 c uo5 rr
2341
+ dar1 d a1 rr
2342
+ dar2 d a2 rr
2343
+ dar3 d a3 rr
2344
+ dar4 d a4 rr
2345
+ dar5 d a5 rr
2346
+ dair1 d ai1 rr
2347
+ dair2 d ai2 rr
2348
+ dair3 d ai3 rr
2349
+ dair4 d ai4 rr
2350
+ dair5 d ai5 rr
2351
+ danr1 d an1 rr
2352
+ danr2 d an2 rr
2353
+ danr3 d an3 rr
2354
+ danr4 d an4 rr
2355
+ danr5 d an5 rr
2356
+ dangr1 d ang1 rr
2357
+ dangr2 d ang2 rr
2358
+ dangr3 d ang3 rr
2359
+ dangr4 d ang4 rr
2360
+ dangr5 d ang5 rr
2361
+ daor1 d ao1 rr
2362
+ daor2 d ao2 rr
2363
+ daor3 d ao3 rr
2364
+ daor4 d ao4 rr
2365
+ daor5 d ao5 rr
2366
+ der1 d e1 rr
2367
+ der2 d e2 rr
2368
+ der3 d e3 rr
2369
+ der4 d e4 rr
2370
+ der5 d e5 rr
2371
+ deir1 d ei1 rr
2372
+ deir2 d ei2 rr
2373
+ deir3 d ei3 rr
2374
+ deir4 d ei4 rr
2375
+ deir5 d ei5 rr
2376
+ denr1 d en1 rr
2377
+ denr2 d en2 rr
2378
+ denr3 d en3 rr
2379
+ denr4 d en4 rr
2380
+ denr5 d en5 rr
2381
+ dengr1 d eng1 rr
2382
+ dengr2 d eng2 rr
2383
+ dengr3 d eng3 rr
2384
+ dengr4 d eng4 rr
2385
+ dengr5 d eng5 rr
2386
+ dir1 d i1 rr
2387
+ dir2 d i2 rr
2388
+ dir3 d i3 rr
2389
+ dir4 d i4 rr
2390
+ dir5 d i5 rr
2391
+ diar1 d ia1 rr
2392
+ diar2 d ia2 rr
2393
+ diar3 d ia3 rr
2394
+ diar4 d ia4 rr
2395
+ diar5 d ia5 rr
2396
+ dianr1 d ian1 rr
2397
+ dianr2 d ian2 rr
2398
+ dianr3 d ian3 rr
2399
+ dianr4 d ian4 rr
2400
+ dianr5 d ian5 rr
2401
+ diaor1 d iao1 rr
2402
+ diaor2 d iao2 rr
2403
+ diaor3 d iao3 rr
2404
+ diaor4 d iao4 rr
2405
+ diaor5 d iao5 rr
2406
+ dier1 d ie1 rr
2407
+ dier2 d ie2 rr
2408
+ dier3 d ie3 rr
2409
+ dier4 d ie4 rr
2410
+ dier5 d ie5 rr
2411
+ dingr1 d ing1 rr
2412
+ dingr2 d ing2 rr
2413
+ dingr3 d ing3 rr
2414
+ dingr4 d ing4 rr
2415
+ dingr5 d ing5 rr
2416
+ diur1 d iou1 rr
2417
+ diur2 d iou2 rr
2418
+ diur3 d iou3 rr
2419
+ diur4 d iou4 rr
2420
+ diur5 d iou5 rr
2421
+ dongr1 d ong1 rr
2422
+ dongr2 d ong2 rr
2423
+ dongr3 d ong3 rr
2424
+ dongr4 d ong4 rr
2425
+ dongr5 d ong5 rr
2426
+ dour1 d ou1 rr
2427
+ dour2 d ou2 rr
2428
+ dour3 d ou3 rr
2429
+ dour4 d ou4 rr
2430
+ dour5 d ou5 rr
2431
+ dur1 d u1 rr
2432
+ dur2 d u2 rr
2433
+ dur3 d u3 rr
2434
+ dur4 d u4 rr
2435
+ dur5 d u5 rr
2436
+ duanr1 d uan1 rr
2437
+ duanr2 d uan2 rr
2438
+ duanr3 d uan3 rr
2439
+ duanr4 d uan4 rr
2440
+ duanr5 d uan5 rr
2441
+ duir1 d uei1 rr
2442
+ duir2 d uei2 rr
2443
+ duir3 d uei3 rr
2444
+ duir4 d uei4 rr
2445
+ duir5 d uei5 rr
2446
+ dunr1 d uen1 rr
2447
+ dunr2 d uen2 rr
2448
+ dunr3 d uen3 rr
2449
+ dunr4 d uen4 rr
2450
+ dunr5 d uen5 rr
2451
+ duor1 d uo1 rr
2452
+ duor2 d uo2 rr
2453
+ duor3 d uo3 rr
2454
+ duor4 d uo4 rr
2455
+ duor5 d uo5 rr
2456
+ er1 e1 rr
2457
+ er2 e2 rr
2458
+ er3 e3 rr
2459
+ er4 e4 rr
2460
+ er5 e5 rr
2461
+ eir1 ei1 rr
2462
+ eir2 ei2 rr
2463
+ eir3 ei3 rr
2464
+ eir4 ei4 rr
2465
+ eir5 ei5 rr
2466
+ enr1 en1 rr
2467
+ enr2 en2 rr
2468
+ enr3 en3 rr
2469
+ enr4 en4 rr
2470
+ enr5 en5 rr
2471
+ engr1 eng1 rr
2472
+ engr2 eng2 rr
2473
+ engr3 eng3 rr
2474
+ engr4 eng4 rr
2475
+ engr5 eng5 rr
2476
+ far1 f a1 rr
2477
+ far2 f a2 rr
2478
+ far3 f a3 rr
2479
+ far4 f a4 rr
2480
+ far5 f a5 rr
2481
+ fanr1 f an1 rr
2482
+ fanr2 f an2 rr
2483
+ fanr3 f an3 rr
2484
+ fanr4 f an4 rr
2485
+ fanr5 f an5 rr
2486
+ fangr1 f ang1 rr
2487
+ fangr2 f ang2 rr
2488
+ fangr3 f ang3 rr
2489
+ fangr4 f ang4 rr
2490
+ fangr5 f ang5 rr
2491
+ feir1 f ei1 rr
2492
+ feir2 f ei2 rr
2493
+ feir3 f ei3 rr
2494
+ feir4 f ei4 rr
2495
+ feir5 f ei5 rr
2496
+ fenr1 f en1 rr
2497
+ fenr2 f en2 rr
2498
+ fenr3 f en3 rr
2499
+ fenr4 f en4 rr
2500
+ fenr5 f en5 rr
2501
+ fengr1 f eng1 rr
2502
+ fengr2 f eng2 rr
2503
+ fengr3 f eng3 rr
2504
+ fengr4 f eng4 rr
2505
+ fengr5 f eng5 rr
2506
+ for1 f o1 rr
2507
+ for2 f o2 rr
2508
+ for3 f o3 rr
2509
+ for4 f o4 rr
2510
+ for5 f o5 rr
2511
+ four1 f ou1 rr
2512
+ four2 f ou2 rr
2513
+ four3 f ou3 rr
2514
+ four4 f ou4 rr
2515
+ four5 f ou5 rr
2516
+ fur1 f u1 rr
2517
+ fur2 f u2 rr
2518
+ fur3 f u3 rr
2519
+ fur4 f u4 rr
2520
+ fur5 f u5 rr
2521
+ gar1 g a1 rr
2522
+ gar2 g a2 rr
2523
+ gar3 g a3 rr
2524
+ gar4 g a4 rr
2525
+ gar5 g a5 rr
2526
+ gair1 g ai1 rr
2527
+ gair2 g ai2 rr
2528
+ gair3 g ai3 rr
2529
+ gair4 g ai4 rr
2530
+ gair5 g ai5 rr
2531
+ ganr1 g an1 rr
2532
+ ganr2 g an2 rr
2533
+ ganr3 g an3 rr
2534
+ ganr4 g an4 rr
2535
+ ganr5 g an5 rr
2536
+ gangr1 g ang1 rr
2537
+ gangr2 g ang2 rr
2538
+ gangr3 g ang3 rr
2539
+ gangr4 g ang4 rr
2540
+ gangr5 g ang5 rr
2541
+ gaor1 g ao1 rr
2542
+ gaor2 g ao2 rr
2543
+ gaor3 g ao3 rr
2544
+ gaor4 g ao4 rr
2545
+ gaor5 g ao5 rr
2546
+ ger1 g e1 rr
2547
+ ger2 g e2 rr
2548
+ ger3 g e3 rr
2549
+ ger4 g e4 rr
2550
+ ger5 g e5 rr
2551
+ geir1 g ei1 rr
2552
+ geir2 g ei2 rr
2553
+ geir3 g ei3 rr
2554
+ geir4 g ei4 rr
2555
+ geir5 g ei5 rr
2556
+ genr1 g en1 rr
2557
+ genr2 g en2 rr
2558
+ genr3 g en3 rr
2559
+ genr4 g en4 rr
2560
+ genr5 g en5 rr
2561
+ gengr1 g eng1 rr
2562
+ gengr2 g eng2 rr
2563
+ gengr3 g eng3 rr
2564
+ gengr4 g eng4 rr
2565
+ gengr5 g eng5 rr
2566
+ gongr1 g ong1 rr
2567
+ gongr2 g ong2 rr
2568
+ gongr3 g ong3 rr
2569
+ gongr4 g ong4 rr
2570
+ gongr5 g ong5 rr
2571
+ gour1 g ou1 rr
2572
+ gour2 g ou2 rr
2573
+ gour3 g ou3 rr
2574
+ gour4 g ou4 rr
2575
+ gour5 g ou5 rr
2576
+ gur1 g u1 rr
2577
+ gur2 g u2 rr
2578
+ gur3 g u3 rr
2579
+ gur4 g u4 rr
2580
+ gur5 g u5 rr
2581
+ guar1 g ua1 rr
2582
+ guar2 g ua2 rr
2583
+ guar3 g ua3 rr
2584
+ guar4 g ua4 rr
2585
+ guar5 g ua5 rr
2586
+ guair1 g uai1 rr
2587
+ guair2 g uai2 rr
2588
+ guair3 g uai3 rr
2589
+ guair4 g uai4 rr
2590
+ guair5 g uai5 rr
2591
+ guanr1 g uan1 rr
2592
+ guanr2 g uan2 rr
2593
+ guanr3 g uan3 rr
2594
+ guanr4 g uan4 rr
2595
+ guanr5 g uan5 rr
2596
+ guangr1 g uang1 rr
2597
+ guangr2 g uang2 rr
2598
+ guangr3 g uang3 rr
2599
+ guangr4 g uang4 rr
2600
+ guangr5 g uang5 rr
2601
+ guir1 g uei1 rr
2602
+ guir2 g uei2 rr
2603
+ guir3 g uei3 rr
2604
+ guir4 g uei4 rr
2605
+ guir5 g uei5 rr
2606
+ gunr1 g uen1 rr
2607
+ gunr2 g uen2 rr
2608
+ gunr3 g uen3 rr
2609
+ gunr4 g uen4 rr
2610
+ gunr5 g uen5 rr
2611
+ guor1 g uo1 rr
2612
+ guor2 g uo2 rr
2613
+ guor3 g uo3 rr
2614
+ guor4 g uo4 rr
2615
+ guor5 g uo5 rr
2616
+ har1 h a1 rr
2617
+ har2 h a2 rr
2618
+ har3 h a3 rr
2619
+ har4 h a4 rr
2620
+ har5 h a5 rr
2621
+ hair1 h ai1 rr
2622
+ hair2 h ai2 rr
2623
+ hair3 h ai3 rr
2624
+ hair4 h ai4 rr
2625
+ hair5 h ai5 rr
2626
+ hanr1 h an1 rr
2627
+ hanr2 h an2 rr
2628
+ hanr3 h an3 rr
2629
+ hanr4 h an4 rr
2630
+ hanr5 h an5 rr
2631
+ hangr1 h ang1 rr
2632
+ hangr2 h ang2 rr
2633
+ hangr3 h ang3 rr
2634
+ hangr4 h ang4 rr
2635
+ hangr5 h ang5 rr
2636
+ haor1 h ao1 rr
2637
+ haor2 h ao2 rr
2638
+ haor3 h ao3 rr
2639
+ haor4 h ao4 rr
2640
+ haor5 h ao5 rr
2641
+ her1 h e1 rr
2642
+ her2 h e2 rr
2643
+ her3 h e3 rr
2644
+ her4 h e4 rr
2645
+ her5 h e5 rr
2646
+ heir1 h ei1 rr
2647
+ heir2 h ei2 rr
2648
+ heir3 h ei3 rr
2649
+ heir4 h ei4 rr
2650
+ heir5 h ei5 rr
2651
+ henr1 h en1 rr
2652
+ henr2 h en2 rr
2653
+ henr3 h en3 rr
2654
+ henr4 h en4 rr
2655
+ henr5 h en5 rr
2656
+ hengr1 h eng1 rr
2657
+ hengr2 h eng2 rr
2658
+ hengr3 h eng3 rr
2659
+ hengr4 h eng4 rr
2660
+ hengr5 h eng5 rr
2661
+ hongr1 h ong1 rr
2662
+ hongr2 h ong2 rr
2663
+ hongr3 h ong3 rr
2664
+ hongr4 h ong4 rr
2665
+ hongr5 h ong5 rr
2666
+ hour1 h ou1 rr
2667
+ hour2 h ou2 rr
2668
+ hour3 h ou3 rr
2669
+ hour4 h ou4 rr
2670
+ hour5 h ou5 rr
2671
+ hur1 h u1 rr
2672
+ hur2 h u2 rr
2673
+ hur3 h u3 rr
2674
+ hur4 h u4 rr
2675
+ hur5 h u5 rr
2676
+ huar1 h ua1 rr
2677
+ huar2 h ua2 rr
2678
+ huar3 h ua3 rr
2679
+ huar4 h ua4 rr
2680
+ huar5 h ua5 rr
2681
+ huair1 h uai1 rr
2682
+ huair2 h uai2 rr
2683
+ huair3 h uai3 rr
2684
+ huair4 h uai4 rr
2685
+ huair5 h uai5 rr
2686
+ huanr1 h uan1 rr
2687
+ huanr2 h uan2 rr
2688
+ huanr3 h uan3 rr
2689
+ huanr4 h uan4 rr
2690
+ huanr5 h uan5 rr
2691
+ huangr1 h uang1 rr
2692
+ huangr2 h uang2 rr
2693
+ huangr3 h uang3 rr
2694
+ huangr4 h uang4 rr
2695
+ huangr5 h uang5 rr
2696
+ huir1 h uei1 rr
2697
+ huir2 h uei2 rr
2698
+ huir3 h uei3 rr
2699
+ huir4 h uei4 rr
2700
+ huir5 h uei5 rr
2701
+ hunr1 h uen1 rr
2702
+ hunr2 h uen2 rr
2703
+ hunr3 h uen3 rr
2704
+ hunr4 h uen4 rr
2705
+ hunr5 h uen5 rr
2706
+ huor1 h uo1 rr
2707
+ huor2 h uo2 rr
2708
+ huor3 h uo3 rr
2709
+ huor4 h uo4 rr
2710
+ huor5 h uo5 rr
2711
+ jir1 j i1 rr
2712
+ jir2 j i2 rr
2713
+ jir3 j i3 rr
2714
+ jir4 j i4 rr
2715
+ jir5 j i5 rr
2716
+ jiar1 j ia1 rr
2717
+ jiar2 j ia2 rr
2718
+ jiar3 j ia3 rr
2719
+ jiar4 j ia4 rr
2720
+ jiar5 j ia5 rr
2721
+ jianr1 j ian1 rr
2722
+ jianr2 j ian2 rr
2723
+ jianr3 j ian3 rr
2724
+ jianr4 j ian4 rr
2725
+ jianr5 j ian5 rr
2726
+ jiangr1 j iang1 rr
2727
+ jiangr2 j iang2 rr
2728
+ jiangr3 j iang3 rr
2729
+ jiangr4 j iang4 rr
2730
+ jiangr5 j iang5 rr
2731
+ jiaor1 j iao1 rr
2732
+ jiaor2 j iao2 rr
2733
+ jiaor3 j iao3 rr
2734
+ jiaor4 j iao4 rr
2735
+ jiaor5 j iao5 rr
2736
+ jier1 j ie1 rr
2737
+ jier2 j ie2 rr
2738
+ jier3 j ie3 rr
2739
+ jier4 j ie4 rr
2740
+ jier5 j ie5 rr
2741
+ jinr1 j in1 rr
2742
+ jinr2 j in2 rr
2743
+ jinr3 j in3 rr
2744
+ jinr4 j in4 rr
2745
+ jinr5 j in5 rr
2746
+ jingr1 j ing1 rr
2747
+ jingr2 j ing2 rr
2748
+ jingr3 j ing3 rr
2749
+ jingr4 j ing4 rr
2750
+ jingr5 j ing5 rr
2751
+ jiongr1 j iong1 rr
2752
+ jiongr2 j iong2 rr
2753
+ jiongr3 j iong3 rr
2754
+ jiongr4 j iong4 rr
2755
+ jiongr5 j iong5 rr
2756
+ jiur1 j iou1 rr
2757
+ jiur2 j iou2 rr
2758
+ jiur3 j iou3 rr
2759
+ jiur4 j iou4 rr
2760
+ jiur5 j iou5 rr
2761
+ jur1 j v1 rr
2762
+ jur2 j v2 rr
2763
+ jur3 j v3 rr
2764
+ jur4 j v4 rr
2765
+ jur5 j v5 rr
2766
+ juanr1 j van1 rr
2767
+ juanr2 j van2 rr
2768
+ juanr3 j van3 rr
2769
+ juanr4 j van4 rr
2770
+ juanr5 j van5 rr
2771
+ juer1 j ve1 rr
2772
+ juer2 j ve2 rr
2773
+ juer3 j ve3 rr
2774
+ juer4 j ve4 rr
2775
+ juer5 j ve5 rr
2776
+ junr1 j vn1 rr
2777
+ junr2 j vn2 rr
2778
+ junr3 j vn3 rr
2779
+ junr4 j vn4 rr
2780
+ junr5 j vn5 rr
2781
+ kar1 k a1 rr
2782
+ kar2 k a2 rr
2783
+ kar3 k a3 rr
2784
+ kar4 k a4 rr
2785
+ kar5 k a5 rr
2786
+ kair1 k ai1 rr
2787
+ kair2 k ai2 rr
2788
+ kair3 k ai3 rr
2789
+ kair4 k ai4 rr
2790
+ kair5 k ai5 rr
2791
+ kanr1 k an1 rr
2792
+ kanr2 k an2 rr
2793
+ kanr3 k an3 rr
2794
+ kanr4 k an4 rr
2795
+ kanr5 k an5 rr
2796
+ kangr1 k ang1 rr
2797
+ kangr2 k ang2 rr
2798
+ kangr3 k ang3 rr
2799
+ kangr4 k ang4 rr
2800
+ kangr5 k ang5 rr
2801
+ kaor1 k ao1 rr
2802
+ kaor2 k ao2 rr
2803
+ kaor3 k ao3 rr
2804
+ kaor4 k ao4 rr
2805
+ kaor5 k ao5 rr
2806
+ ker1 k e1 rr
2807
+ ker2 k e2 rr
2808
+ ker3 k e3 rr
2809
+ ker4 k e4 rr
2810
+ ker5 k e5 rr
2811
+ keir1 k ei1 rr
2812
+ keir2 k ei2 rr
2813
+ keir3 k ei3 rr
2814
+ keir4 k ei4 rr
2815
+ keir5 k ei5 rr
2816
+ kenr1 k en1 rr
2817
+ kenr2 k en2 rr
2818
+ kenr3 k en3 rr
2819
+ kenr4 k en4 rr
2820
+ kenr5 k en5 rr
2821
+ kengr1 k eng1 rr
2822
+ kengr2 k eng2 rr
2823
+ kengr3 k eng3 rr
2824
+ kengr4 k eng4 rr
2825
+ kengr5 k eng5 rr
2826
+ kongr1 k ong1 rr
2827
+ kongr2 k ong2 rr
2828
+ kongr3 k ong3 rr
2829
+ kongr4 k ong4 rr
2830
+ kongr5 k ong5 rr
2831
+ kour1 k ou1 rr
2832
+ kour2 k ou2 rr
2833
+ kour3 k ou3 rr
2834
+ kour4 k ou4 rr
2835
+ kour5 k ou5 rr
2836
+ kur1 k u1 rr
2837
+ kur2 k u2 rr
2838
+ kur3 k u3 rr
2839
+ kur4 k u4 rr
2840
+ kur5 k u5 rr
2841
+ kuar1 k ua1 rr
2842
+ kuar2 k ua2 rr
2843
+ kuar3 k ua3 rr
2844
+ kuar4 k ua4 rr
2845
+ kuar5 k ua5 rr
2846
+ kuair1 k uai1 rr
2847
+ kuair2 k uai2 rr
2848
+ kuair3 k uai3 rr
2849
+ kuair4 k uai4 rr
2850
+ kuair5 k uai5 rr
2851
+ kuanr1 k uan1 rr
2852
+ kuanr2 k uan2 rr
2853
+ kuanr3 k uan3 rr
2854
+ kuanr4 k uan4 rr
2855
+ kuanr5 k uan5 rr
2856
+ kuangr1 k uang1 rr
2857
+ kuangr2 k uang2 rr
2858
+ kuangr3 k uang3 rr
2859
+ kuangr4 k uang4 rr
2860
+ kuangr5 k uang5 rr
2861
+ kuir1 k uei1 rr
2862
+ kuir2 k uei2 rr
2863
+ kuir3 k uei3 rr
2864
+ kuir4 k uei4 rr
2865
+ kuir5 k uei5 rr
2866
+ kunr1 k uen1 rr
2867
+ kunr2 k uen2 rr
2868
+ kunr3 k uen3 rr
2869
+ kunr4 k uen4 rr
2870
+ kunr5 k uen5 rr
2871
+ kuor1 k uo1 rr
2872
+ kuor2 k uo2 rr
2873
+ kuor3 k uo3 rr
2874
+ kuor4 k uo4 rr
2875
+ kuor5 k uo5 rr
2876
+ lar1 l a1 rr
2877
+ lar2 l a2 rr
2878
+ lar3 l a3 rr
2879
+ lar4 l a4 rr
2880
+ lar5 l a5 rr
2881
+ lair1 l ai1 rr
2882
+ lair2 l ai2 rr
2883
+ lair3 l ai3 rr
2884
+ lair4 l ai4 rr
2885
+ lair5 l ai5 rr
2886
+ lanr1 l an1 rr
2887
+ lanr2 l an2 rr
2888
+ lanr3 l an3 rr
2889
+ lanr4 l an4 rr
2890
+ lanr5 l an5 rr
2891
+ langr1 l ang1 rr
2892
+ langr2 l ang2 rr
2893
+ langr3 l ang3 rr
2894
+ langr4 l ang4 rr
2895
+ langr5 l ang5 rr
2896
+ laor1 l ao1 rr
2897
+ laor2 l ao2 rr
2898
+ laor3 l ao3 rr
2899
+ laor4 l ao4 rr
2900
+ laor5 l ao5 rr
2901
+ ler1 l e1 rr
2902
+ ler2 l e2 rr
2903
+ ler3 l e3 rr
2904
+ ler4 l e4 rr
2905
+ ler5 l e5 rr
2906
+ leir1 l ei1 rr
2907
+ leir2 l ei2 rr
2908
+ leir3 l ei3 rr
2909
+ leir4 l ei4 rr
2910
+ leir5 l ei5 rr
2911
+ lengr1 l eng1 rr
2912
+ lengr2 l eng2 rr
2913
+ lengr3 l eng3 rr
2914
+ lengr4 l eng4 rr
2915
+ lengr5 l eng5 rr
2916
+ lir1 l i1 rr
2917
+ lir2 l i2 rr
2918
+ lir3 l i3 rr
2919
+ lir4 l i4 rr
2920
+ lir5 l i5 rr
2921
+ liar1 l ia1 rr
2922
+ liar2 l ia2 rr
2923
+ liar3 l ia3 rr
2924
+ liar4 l ia4 rr
2925
+ liar5 l ia5 rr
2926
+ lianr1 l ian1 rr
2927
+ lianr2 l ian2 rr
2928
+ lianr3 l ian3 rr
2929
+ lianr4 l ian4 rr
2930
+ lianr5 l ian5 rr
2931
+ liangr1 l iang1 rr
2932
+ liangr2 l iang2 rr
2933
+ liangr3 l iang3 rr
2934
+ liangr4 l iang4 rr
2935
+ liangr5 l iang5 rr
2936
+ liaor1 l iao1 rr
2937
+ liaor2 l iao2 rr
2938
+ liaor3 l iao3 rr
2939
+ liaor4 l iao4 rr
2940
+ liaor5 l iao5 rr
2941
+ lier1 l ie1 rr
2942
+ lier2 l ie2 rr
2943
+ lier3 l ie3 rr
2944
+ lier4 l ie4 rr
2945
+ lier5 l ie5 rr
2946
+ linr1 l in1 rr
2947
+ linr2 l in2 rr
2948
+ linr3 l in3 rr
2949
+ linr4 l in4 rr
2950
+ linr5 l in5 rr
2951
+ lingr1 l ing1 rr
2952
+ lingr2 l ing2 rr
2953
+ lingr3 l ing3 rr
2954
+ lingr4 l ing4 rr
2955
+ lingr5 l ing5 rr
2956
+ liur1 l iou1 rr
2957
+ liur2 l iou2 rr
2958
+ liur3 l iou3 rr
2959
+ liur4 l iou4 rr
2960
+ liur5 l iou5 rr
2961
+ lor1 l o1 rr
2962
+ lor2 l o2 rr
2963
+ lor3 l o3 rr
2964
+ lor4 l o4 rr
2965
+ lor5 l o5 rr
2966
+ longr1 l ong1 rr
2967
+ longr2 l ong2 rr
2968
+ longr3 l ong3 rr
2969
+ longr4 l ong4 rr
2970
+ longr5 l ong5 rr
2971
+ lour1 l ou1 rr
2972
+ lour2 l ou2 rr
2973
+ lour3 l ou3 rr
2974
+ lour4 l ou4 rr
2975
+ lour5 l ou5 rr
2976
+ lur1 l u1 rr
2977
+ lur2 l u2 rr
2978
+ lur3 l u3 rr
2979
+ lur4 l u4 rr
2980
+ lur5 l u5 rr
2981
+ luanr1 l uan1 rr
2982
+ luanr2 l uan2 rr
2983
+ luanr3 l uan3 rr
2984
+ luanr4 l uan4 rr
2985
+ luanr5 l uan5 rr
2986
+ luer1 l ve1 rr
2987
+ luer2 l ve2 rr
2988
+ luer3 l ve3 rr
2989
+ luer4 l ve4 rr
2990
+ luer5 l ve5 rr
2991
+ lver1 l ve1 rr
2992
+ lver2 l ve2 rr
2993
+ lver3 l ve3 rr
2994
+ lver4 l ve4 rr
2995
+ lver5 l ve5 rr
2996
+ lunr1 l uen1 rr
2997
+ lunr2 l uen2 rr
2998
+ lunr3 l uen3 rr
2999
+ lunr4 l uen4 rr
3000
+ lunr5 l uen5 rr
3001
+ luor1 l uo1 rr
3002
+ luor2 l uo2 rr
3003
+ luor3 l uo3 rr
3004
+ luor4 l uo4 rr
3005
+ luor5 l uo5 rr
3006
+ lvr1 l v1 rr
3007
+ lvr2 l v2 rr
3008
+ lvr3 l v3 rr
3009
+ lvr4 l v4 rr
3010
+ lvr5 l v5 rr
3011
+ mar1 m a1 rr
3012
+ mar2 m a2 rr
3013
+ mar3 m a3 rr
3014
+ mar4 m a4 rr
3015
+ mar5 m a5 rr
3016
+ mair1 m ai1 rr
3017
+ mair2 m ai2 rr
3018
+ mair3 m ai3 rr
3019
+ mair4 m ai4 rr
3020
+ mair5 m ai5 rr
3021
+ manr1 m an1 rr
3022
+ manr2 m an2 rr
3023
+ manr3 m an3 rr
3024
+ manr4 m an4 rr
3025
+ manr5 m an5 rr
3026
+ mangr1 m ang1 rr
3027
+ mangr2 m ang2 rr
3028
+ mangr3 m ang3 rr
3029
+ mangr4 m ang4 rr
3030
+ mangr5 m ang5 rr
3031
+ maor1 m ao1 rr
3032
+ maor2 m ao2 rr
3033
+ maor3 m ao3 rr
3034
+ maor4 m ao4 rr
3035
+ maor5 m ao5 rr
3036
+ mer1 m e1 rr
3037
+ mer2 m e2 rr
3038
+ mer3 m e3 rr
3039
+ mer4 m e4 rr
3040
+ mer5 m e5 rr
3041
+ meir1 m ei1 rr
3042
+ meir2 m ei2 rr
3043
+ meir3 m ei3 rr
3044
+ meir4 m ei4 rr
3045
+ meir5 m ei5 rr
3046
+ menr1 m en1 rr
3047
+ menr2 m en2 rr
3048
+ menr3 m en3 rr
3049
+ menr4 m en4 rr
3050
+ menr5 m en5 rr
3051
+ mengr1 m eng1 rr
3052
+ mengr2 m eng2 rr
3053
+ mengr3 m eng3 rr
3054
+ mengr4 m eng4 rr
3055
+ mengr5 m eng5 rr
3056
+ mir1 m i1 rr
3057
+ mir2 m i2 rr
3058
+ mir3 m i3 rr
3059
+ mir4 m i4 rr
3060
+ mir5 m i5 rr
3061
+ mianr1 m ian1 rr
3062
+ mianr2 m ian2 rr
3063
+ mianr3 m ian3 rr
3064
+ mianr4 m ian4 rr
3065
+ mianr5 m ian5 rr
3066
+ miaor1 m iao1 rr
3067
+ miaor2 m iao2 rr
3068
+ miaor3 m iao3 rr
3069
+ miaor4 m iao4 rr
3070
+ miaor5 m iao5 rr
3071
+ mier1 m ie1 rr
3072
+ mier2 m ie2 rr
3073
+ mier3 m ie3 rr
3074
+ mier4 m ie4 rr
3075
+ mier5 m ie5 rr
3076
+ minr1 m in1 rr
3077
+ minr2 m in2 rr
3078
+ minr3 m in3 rr
3079
+ minr4 m in4 rr
3080
+ minr5 m in5 rr
3081
+ mingr1 m ing1 rr
3082
+ mingr2 m ing2 rr
3083
+ mingr3 m ing3 rr
3084
+ mingr4 m ing4 rr
3085
+ mingr5 m ing5 rr
3086
+ miur1 m iou1 rr
3087
+ miur2 m iou2 rr
3088
+ miur3 m iou3 rr
3089
+ miur4 m iou4 rr
3090
+ miur5 m iou5 rr
3091
+ mor1 m o1 rr
3092
+ mor2 m o2 rr
3093
+ mor3 m o3 rr
3094
+ mor4 m o4 rr
3095
+ mor5 m o5 rr
3096
+ mour1 m ou1 rr
3097
+ mour2 m ou2 rr
3098
+ mour3 m ou3 rr
3099
+ mour4 m ou4 rr
3100
+ mour5 m ou5 rr
3101
+ mur1 m u1 rr
3102
+ mur2 m u2 rr
3103
+ mur3 m u3 rr
3104
+ mur4 m u4 rr
3105
+ mur5 m u5 rr
3106
+ nar1 n a1 rr
3107
+ nar2 n a2 rr
3108
+ nar3 n a3 rr
3109
+ nar4 n a4 rr
3110
+ nar5 n a5 rr
3111
+ nair1 n ai1 rr
3112
+ nair2 n ai2 rr
3113
+ nair3 n ai3 rr
3114
+ nair4 n ai4 rr
3115
+ nair5 n ai5 rr
3116
+ nanr1 n an1 rr
3117
+ nanr2 n an2 rr
3118
+ nanr3 n an3 rr
3119
+ nanr4 n an4 rr
3120
+ nanr5 n an5 rr
3121
+ nangr1 n ang1 rr
3122
+ nangr2 n ang2 rr
3123
+ nangr3 n ang3 rr
3124
+ nangr4 n ang4 rr
3125
+ nangr5 n ang5 rr
3126
+ naor1 n ao1 rr
3127
+ naor2 n ao2 rr
3128
+ naor3 n ao3 rr
3129
+ naor4 n ao4 rr
3130
+ naor5 n ao5 rr
3131
+ ner1 n e1 rr
3132
+ ner2 n e2 rr
3133
+ ner3 n e3 rr
3134
+ ner4 n e4 rr
3135
+ ner5 n e5 rr
3136
+ neir1 n ei1 rr
3137
+ neir2 n ei2 rr
3138
+ neir3 n ei3 rr
3139
+ neir4 n ei4 rr
3140
+ neir5 n ei5 rr
3141
+ nenr1 n en1 rr
3142
+ nenr2 n en2 rr
3143
+ nenr3 n en3 rr
3144
+ nenr4 n en4 rr
3145
+ nenr5 n en5 rr
3146
+ nengr1 n eng1 rr
3147
+ nengr2 n eng2 rr
3148
+ nengr3 n eng3 rr
3149
+ nengr4 n eng4 rr
3150
+ nengr5 n eng5 rr
3151
+ nir1 n i1 rr
3152
+ nir2 n i2 rr
3153
+ nir3 n i3 rr
3154
+ nir4 n i4 rr
3155
+ nir5 n i5 rr
3156
+ nianr1 n ian1 rr
3157
+ nianr2 n ian2 rr
3158
+ nianr3 n ian3 rr
3159
+ nianr4 n ian4 rr
3160
+ nianr5 n ian5 rr
3161
+ niangr1 n iang1 rr
3162
+ niangr2 n iang2 rr
3163
+ niangr3 n iang3 rr
3164
+ niangr4 n iang4 rr
3165
+ niangr5 n iang5 rr
3166
+ niaor1 n iao1 rr
3167
+ niaor2 n iao2 rr
3168
+ niaor3 n iao3 rr
3169
+ niaor4 n iao4 rr
3170
+ niaor5 n iao5 rr
3171
+ nier1 n ie1 rr
3172
+ nier2 n ie2 rr
3173
+ nier3 n ie3 rr
3174
+ nier4 n ie4 rr
3175
+ nier5 n ie5 rr
3176
+ ninr1 n in1 rr
3177
+ ninr2 n in2 rr
3178
+ ninr3 n in3 rr
3179
+ ninr4 n in4 rr
3180
+ ninr5 n in5 rr
3181
+ ningr1 n ing1 rr
3182
+ ningr2 n ing2 rr
3183
+ ningr3 n ing3 rr
3184
+ ningr4 n ing4 rr
3185
+ ningr5 n ing5 rr
3186
+ niur1 n iou1 rr
3187
+ niur2 n iou2 rr
3188
+ niur3 n iou3 rr
3189
+ niur4 n iou4 rr
3190
+ niur5 n iou5 rr
3191
+ nongr1 n ong1 rr
3192
+ nongr2 n ong2 rr
3193
+ nongr3 n ong3 rr
3194
+ nongr4 n ong4 rr
3195
+ nongr5 n ong5 rr
3196
+ nour1 n ou1 rr
3197
+ nour2 n ou2 rr
3198
+ nour3 n ou3 rr
3199
+ nour4 n ou4 rr
3200
+ nour5 n ou5 rr
3201
+ nur1 n u1 rr
3202
+ nur2 n u2 rr
3203
+ nur3 n u3 rr
3204
+ nur4 n u4 rr
3205
+ nur5 n u5 rr
3206
+ nuanr1 n uan1 rr
3207
+ nuanr2 n uan2 rr
3208
+ nuanr3 n uan3 rr
3209
+ nuanr4 n uan4 rr
3210
+ nuanr5 n uan5 rr
3211
+ nuer1 n ve1 rr
3212
+ nuer2 n ve2 rr
3213
+ nuer3 n ve3 rr
3214
+ nuer4 n ve4 rr
3215
+ nuer5 n ve5 rr
3216
+ nver1 n ve1 rr
3217
+ nver2 n ve2 rr
3218
+ nver3 n ve3 rr
3219
+ nver4 n ve4 rr
3220
+ nver5 n ve5 rr
3221
+ nuor1 n uo1 rr
3222
+ nuor2 n uo2 rr
3223
+ nuor3 n uo3 rr
3224
+ nuor4 n uo4 rr
3225
+ nuor5 n uo5 rr
3226
+ nvr1 n v1 rr
3227
+ nvr2 n v2 rr
3228
+ nvr3 n v3 rr
3229
+ nvr4 n v4 rr
3230
+ nvr5 n v5 rr
3231
+ or1 o1 rr
3232
+ or2 o2 rr
3233
+ or3 o3 rr
3234
+ or4 o4 rr
3235
+ or5 o5 rr
3236
+ our1 ou1 rr
3237
+ our2 ou2 rr
3238
+ our3 ou3 rr
3239
+ our4 ou4 rr
3240
+ our5 ou5 rr
3241
+ par1 p a1 rr
3242
+ par2 p a2 rr
3243
+ par3 p a3 rr
3244
+ par4 p a4 rr
3245
+ par5 p a5 rr
3246
+ pair1 p ai1 rr
3247
+ pair2 p ai2 rr
3248
+ pair3 p ai3 rr
3249
+ pair4 p ai4 rr
3250
+ pair5 p ai5 rr
3251
+ panr1 p an1 rr
3252
+ panr2 p an2 rr
3253
+ panr3 p an3 rr
3254
+ panr4 p an4 rr
3255
+ panr5 p an5 rr
3256
+ pangr1 p ang1 rr
3257
+ pangr2 p ang2 rr
3258
+ pangr3 p ang3 rr
3259
+ pangr4 p ang4 rr
3260
+ pangr5 p ang5 rr
3261
+ paor1 p ao1 rr
3262
+ paor2 p ao2 rr
3263
+ paor3 p ao3 rr
3264
+ paor4 p ao4 rr
3265
+ paor5 p ao5 rr
3266
+ peir1 p ei1 rr
3267
+ peir2 p ei2 rr
3268
+ peir3 p ei3 rr
3269
+ peir4 p ei4 rr
3270
+ peir5 p ei5 rr
3271
+ penr1 p en1 rr
3272
+ penr2 p en2 rr
3273
+ penr3 p en3 rr
3274
+ penr4 p en4 rr
3275
+ penr5 p en5 rr
3276
+ pengr1 p eng1 rr
3277
+ pengr2 p eng2 rr
3278
+ pengr3 p eng3 rr
3279
+ pengr4 p eng4 rr
3280
+ pengr5 p eng5 rr
3281
+ pir1 p i1 rr
3282
+ pir2 p i2 rr
3283
+ pir3 p i3 rr
3284
+ pir4 p i4 rr
3285
+ pir5 p i5 rr
3286
+ pianr1 p ian1 rr
3287
+ pianr2 p ian2 rr
3288
+ pianr3 p ian3 rr
3289
+ pianr4 p ian4 rr
3290
+ pianr5 p ian5 rr
3291
+ piaor1 p iao1 rr
3292
+ piaor2 p iao2 rr
3293
+ piaor3 p iao3 rr
3294
+ piaor4 p iao4 rr
3295
+ piaor5 p iao5 rr
3296
+ pier1 p ie1 rr
3297
+ pier2 p ie2 rr
3298
+ pier3 p ie3 rr
3299
+ pier4 p ie4 rr
3300
+ pier5 p ie5 rr
3301
+ pinr1 p in1 rr
3302
+ pinr2 p in2 rr
3303
+ pinr3 p in3 rr
3304
+ pinr4 p in4 rr
3305
+ pinr5 p in5 rr
3306
+ pingr1 p ing1 rr
3307
+ pingr2 p ing2 rr
3308
+ pingr3 p ing3 rr
3309
+ pingr4 p ing4 rr
3310
+ pingr5 p ing5 rr
3311
+ por1 p o1 rr
3312
+ por2 p o2 rr
3313
+ por3 p o3 rr
3314
+ por4 p o4 rr
3315
+ por5 p o5 rr
3316
+ pour1 p ou1 rr
3317
+ pour2 p ou2 rr
3318
+ pour3 p ou3 rr
3319
+ pour4 p ou4 rr
3320
+ pour5 p ou5 rr
3321
+ pur1 p u1 rr
3322
+ pur2 p u2 rr
3323
+ pur3 p u3 rr
3324
+ pur4 p u4 rr
3325
+ pur5 p u5 rr
3326
+ qir1 q i1 rr
3327
+ qir2 q i2 rr
3328
+ qir3 q i3 rr
3329
+ qir4 q i4 rr
3330
+ qir5 q i5 rr
3331
+ qiar1 q ia1 rr
3332
+ qiar2 q ia2 rr
3333
+ qiar3 q ia3 rr
3334
+ qiar4 q ia4 rr
3335
+ qiar5 q ia5 rr
3336
+ qianr1 q ian1 rr
3337
+ qianr2 q ian2 rr
3338
+ qianr3 q ian3 rr
3339
+ qianr4 q ian4 rr
3340
+ qianr5 q ian5 rr
3341
+ qiangr1 q iang1 rr
3342
+ qiangr2 q iang2 rr
3343
+ qiangr3 q iang3 rr
3344
+ qiangr4 q iang4 rr
3345
+ qiangr5 q iang5 rr
3346
+ qiaor1 q iao1 rr
3347
+ qiaor2 q iao2 rr
3348
+ qiaor3 q iao3 rr
3349
+ qiaor4 q iao4 rr
3350
+ qiaor5 q iao5 rr
3351
+ qier1 q ie1 rr
3352
+ qier2 q ie2 rr
3353
+ qier3 q ie3 rr
3354
+ qier4 q ie4 rr
3355
+ qier5 q ie5 rr
3356
+ qinr1 q in1 rr
3357
+ qinr2 q in2 rr
3358
+ qinr3 q in3 rr
3359
+ qinr4 q in4 rr
3360
+ qinr5 q in5 rr
3361
+ qingr1 q ing1 rr
3362
+ qingr2 q ing2 rr
3363
+ qingr3 q ing3 rr
3364
+ qingr4 q ing4 rr
3365
+ qingr5 q ing5 rr
3366
+ qiongr1 q iong1 rr
3367
+ qiongr2 q iong2 rr
3368
+ qiongr3 q iong3 rr
3369
+ qiongr4 q iong4 rr
3370
+ qiongr5 q iong5 rr
3371
+ qiur1 q iou1 rr
3372
+ qiur2 q iou2 rr
3373
+ qiur3 q iou3 rr
3374
+ qiur4 q iou4 rr
3375
+ qiur5 q iou5 rr
3376
+ qur1 q v1 rr
3377
+ qur2 q v2 rr
3378
+ qur3 q v3 rr
3379
+ qur4 q v4 rr
3380
+ qur5 q v5 rr
3381
+ quanr1 q van1 rr
3382
+ quanr2 q van2 rr
3383
+ quanr3 q van3 rr
3384
+ quanr4 q van4 rr
3385
+ quanr5 q van5 rr
3386
+ quer1 q ve1 rr
3387
+ quer2 q ve2 rr
3388
+ quer3 q ve3 rr
3389
+ quer4 q ve4 rr
3390
+ quer5 q ve5 rr
3391
+ qunr1 q vn1 rr
3392
+ qunr2 q vn2 rr
3393
+ qunr3 q vn3 rr
3394
+ qunr4 q vn4 rr
3395
+ qunr5 q vn5 rr
3396
+ ranr1 r an1 rr
3397
+ ranr2 r an2 rr
3398
+ ranr3 r an3 rr
3399
+ ranr4 r an4 rr
3400
+ ranr5 r an5 rr
3401
+ rangr1 r ang1 rr
3402
+ rangr2 r ang2 rr
3403
+ rangr3 r ang3 rr
3404
+ rangr4 r ang4 rr
3405
+ rangr5 r ang5 rr
3406
+ raor1 r ao1 rr
3407
+ raor2 r ao2 rr
3408
+ raor3 r ao3 rr
3409
+ raor4 r ao4 rr
3410
+ raor5 r ao5 rr
3411
+ rer1 r e1 rr
3412
+ rer2 r e2 rr
3413
+ rer3 r e3 rr
3414
+ rer4 r e4 rr
3415
+ rer5 r e5 rr
3416
+ renr1 r en1 rr
3417
+ renr2 r en2 rr
3418
+ renr3 r en3 rr
3419
+ renr4 r en4 rr
3420
+ renr5 r en5 rr
3421
+ rengr1 r eng1 rr
3422
+ rengr2 r eng2 rr
3423
+ rengr3 r eng3 rr
3424
+ rengr4 r eng4 rr
3425
+ rengr5 r eng5 rr
3426
+ rir1 r iii1 rr
3427
+ rir2 r iii2 rr
3428
+ rir3 r iii3 rr
3429
+ rir4 r iii4 rr
3430
+ rir5 r iii5 rr
3431
+ rongr1 r ong1 rr
3432
+ rongr2 r ong2 rr
3433
+ rongr3 r ong3 rr
3434
+ rongr4 r ong4 rr
3435
+ rongr5 r ong5 rr
3436
+ rour1 r ou1 rr
3437
+ rour2 r ou2 rr
3438
+ rour3 r ou3 rr
3439
+ rour4 r ou4 rr
3440
+ rour5 r ou5 rr
3441
+ rur1 r u1 rr
3442
+ rur2 r u2 rr
3443
+ rur3 r u3 rr
3444
+ rur4 r u4 rr
3445
+ rur5 r u5 rr
3446
+ ruar1 r ua1 rr
3447
+ ruar2 r ua2 rr
3448
+ ruar3 r ua3 rr
3449
+ ruar4 r ua4 rr
3450
+ ruar5 r ua5 rr
3451
+ ruanr1 r uan1 rr
3452
+ ruanr2 r uan2 rr
3453
+ ruanr3 r uan3 rr
3454
+ ruanr4 r uan4 rr
3455
+ ruanr5 r uan5 rr
3456
+ ruir1 r uei1 rr
3457
+ ruir2 r uei2 rr
3458
+ ruir3 r uei3 rr
3459
+ ruir4 r uei4 rr
3460
+ ruir5 r uei5 rr
3461
+ runr1 r uen1 rr
3462
+ runr2 r uen2 rr
3463
+ runr3 r uen3 rr
3464
+ runr4 r uen4 rr
3465
+ runr5 r uen5 rr
3466
+ ruor1 r uo1 rr
3467
+ ruor2 r uo2 rr
3468
+ ruor3 r uo3 rr
3469
+ ruor4 r uo4 rr
3470
+ ruor5 r uo5 rr
3471
+ sar1 s a1 rr
3472
+ sar2 s a2 rr
3473
+ sar3 s a3 rr
3474
+ sar4 s a4 rr
3475
+ sar5 s a5 rr
3476
+ sair1 s ai1 rr
3477
+ sair2 s ai2 rr
3478
+ sair3 s ai3 rr
3479
+ sair4 s ai4 rr
3480
+ sair5 s ai5 rr
3481
+ sanr1 s an1 rr
3482
+ sanr2 s an2 rr
3483
+ sanr3 s an3 rr
3484
+ sanr4 s an4 rr
3485
+ sanr5 s an5 rr
3486
+ sangr1 s ang1 rr
3487
+ sangr2 s ang2 rr
3488
+ sangr3 s ang3 rr
3489
+ sangr4 s ang4 rr
3490
+ sangr5 s ang5 rr
3491
+ saor1 s ao1 rr
3492
+ saor2 s ao2 rr
3493
+ saor3 s ao3 rr
3494
+ saor4 s ao4 rr
3495
+ saor5 s ao5 rr
3496
+ ser1 s e1 rr
3497
+ ser2 s e2 rr
3498
+ ser3 s e3 rr
3499
+ ser4 s e4 rr
3500
+ ser5 s e5 rr
3501
+ senr1 s en1 rr
3502
+ senr2 s en2 rr
3503
+ senr3 s en3 rr
3504
+ senr4 s en4 rr
3505
+ senr5 s en5 rr
3506
+ sengr1 s eng1 rr
3507
+ sengr2 s eng2 rr
3508
+ sengr3 s eng3 rr
3509
+ sengr4 s eng4 rr
3510
+ sengr5 s eng5 rr
3511
+ shar1 sh a1 rr
3512
+ shar2 sh a2 rr
3513
+ shar3 sh a3 rr
3514
+ shar4 sh a4 rr
3515
+ shar5 sh a5 rr
3516
+ shair1 sh ai1 rr
3517
+ shair2 sh ai2 rr
3518
+ shair3 sh ai3 rr
3519
+ shair4 sh ai4 rr
3520
+ shair5 sh ai5 rr
3521
+ shanr1 sh an1 rr
3522
+ shanr2 sh an2 rr
3523
+ shanr3 sh an3 rr
3524
+ shanr4 sh an4 rr
3525
+ shanr5 sh an5 rr
3526
+ shangr1 sh ang1 rr
3527
+ shangr2 sh ang2 rr
3528
+ shangr3 sh ang3 rr
3529
+ shangr4 sh ang4 rr
3530
+ shangr5 sh ang5 rr
3531
+ shaor1 sh ao1 rr
3532
+ shaor2 sh ao2 rr
3533
+ shaor3 sh ao3 rr
3534
+ shaor4 sh ao4 rr
3535
+ shaor5 sh ao5 rr
3536
+ sher1 sh e1 rr
3537
+ sher2 sh e2 rr
3538
+ sher3 sh e3 rr
3539
+ sher4 sh e4 rr
3540
+ sher5 sh e5 rr
3541
+ sheir1 sh ei1 rr
3542
+ sheir2 sh ei2 rr
3543
+ sheir3 sh ei3 rr
3544
+ sheir4 sh ei4 rr
3545
+ sheir5 sh ei5 rr
3546
+ shenr1 sh en1 rr
3547
+ shenr2 sh en2 rr
3548
+ shenr3 sh en3 rr
3549
+ shenr4 sh en4 rr
3550
+ shenr5 sh en5 rr
3551
+ shengr1 sh eng1 rr
3552
+ shengr2 sh eng2 rr
3553
+ shengr3 sh eng3 rr
3554
+ shengr4 sh eng4 rr
3555
+ shengr5 sh eng5 rr
3556
+ shir1 sh iii1 rr
3557
+ shir2 sh iii2 rr
3558
+ shir3 sh iii3 rr
3559
+ shir4 sh iii4 rr
3560
+ shir5 sh iii5 rr
3561
+ shour1 sh ou1 rr
3562
+ shour2 sh ou2 rr
3563
+ shour3 sh ou3 rr
3564
+ shour4 sh ou4 rr
3565
+ shour5 sh ou5 rr
3566
+ shur1 sh u1 rr
3567
+ shur2 sh u2 rr
3568
+ shur3 sh u3 rr
3569
+ shur4 sh u4 rr
3570
+ shur5 sh u5 rr
3571
+ shuar1 sh ua1 rr
3572
+ shuar2 sh ua2 rr
3573
+ shuar3 sh ua3 rr
3574
+ shuar4 sh ua4 rr
3575
+ shuar5 sh ua5 rr
3576
+ shuair1 sh uai1 rr
3577
+ shuair2 sh uai2 rr
3578
+ shuair3 sh uai3 rr
3579
+ shuair4 sh uai4 rr
3580
+ shuair5 sh uai5 rr
3581
+ shuanr1 sh uan1 rr
3582
+ shuanr2 sh uan2 rr
3583
+ shuanr3 sh uan3 rr
3584
+ shuanr4 sh uan4 rr
3585
+ shuanr5 sh uan5 rr
3586
+ shuangr1 sh uang1 rr
3587
+ shuangr2 sh uang2 rr
3588
+ shuangr3 sh uang3 rr
3589
+ shuangr4 sh uang4 rr
3590
+ shuangr5 sh uang5 rr
3591
+ shuir1 sh uei1 rr
3592
+ shuir2 sh uei2 rr
3593
+ shuir3 sh uei3 rr
3594
+ shuir4 sh uei4 rr
3595
+ shuir5 sh uei5 rr
3596
+ shunr1 sh uen1 rr
3597
+ shunr2 sh uen2 rr
3598
+ shunr3 sh uen3 rr
3599
+ shunr4 sh uen4 rr
3600
+ shunr5 sh uen5 rr
3601
+ shuor1 sh uo1 rr
3602
+ shuor2 sh uo2 rr
3603
+ shuor3 sh uo3 rr
3604
+ shuor4 sh uo4 rr
3605
+ shuor5 sh uo5 rr
3606
+ sir1 s ii1 rr
3607
+ sir2 s ii2 rr
3608
+ sir3 s ii3 rr
3609
+ sir4 s ii4 rr
3610
+ sir5 s ii5 rr
3611
+ songr1 s ong1 rr
3612
+ songr2 s ong2 rr
3613
+ songr3 s ong3 rr
3614
+ songr4 s ong4 rr
3615
+ songr5 s ong5 rr
3616
+ sour1 s ou1 rr
3617
+ sour2 s ou2 rr
3618
+ sour3 s ou3 rr
3619
+ sour4 s ou4 rr
3620
+ sour5 s ou5 rr
3621
+ sur1 s u1 rr
3622
+ sur2 s u2 rr
3623
+ sur3 s u3 rr
3624
+ sur4 s u4 rr
3625
+ sur5 s u5 rr
3626
+ suanr1 s uan1 rr
3627
+ suanr2 s uan2 rr
3628
+ suanr3 s uan3 rr
3629
+ suanr4 s uan4 rr
3630
+ suanr5 s uan5 rr
3631
+ suir1 s uei1 rr
3632
+ suir2 s uei2 rr
3633
+ suir3 s uei3 rr
3634
+ suir4 s uei4 rr
3635
+ suir5 s uei5 rr
3636
+ sunr1 s uen1 rr
3637
+ sunr2 s uen2 rr
3638
+ sunr3 s uen3 rr
3639
+ sunr4 s uen4 rr
3640
+ sunr5 s uen5 rr
3641
+ suor1 s uo1 rr
3642
+ suor2 s uo2 rr
3643
+ suor3 s uo3 rr
3644
+ suor4 s uo4 rr
3645
+ suor5 s uo5 rr
3646
+ tar1 t a1 rr
3647
+ tar2 t a2 rr
3648
+ tar3 t a3 rr
3649
+ tar4 t a4 rr
3650
+ tar5 t a5 rr
3651
+ tair1 t ai1 rr
3652
+ tair2 t ai2 rr
3653
+ tair3 t ai3 rr
3654
+ tair4 t ai4 rr
3655
+ tair5 t ai5 rr
3656
+ tanr1 t an1 rr
3657
+ tanr2 t an2 rr
3658
+ tanr3 t an3 rr
3659
+ tanr4 t an4 rr
3660
+ tanr5 t an5 rr
3661
+ tangr1 t ang1 rr
3662
+ tangr2 t ang2 rr
3663
+ tangr3 t ang3 rr
3664
+ tangr4 t ang4 rr
3665
+ tangr5 t ang5 rr
3666
+ taor1 t ao1 rr
3667
+ taor2 t ao2 rr
3668
+ taor3 t ao3 rr
3669
+ taor4 t ao4 rr
3670
+ taor5 t ao5 rr
3671
+ ter1 t e1 rr
3672
+ ter2 t e2 rr
3673
+ ter3 t e3 rr
3674
+ ter4 t e4 rr
3675
+ ter5 t e5 rr
3676
+ teir1 t ei1 rr
3677
+ teir2 t ei2 rr
3678
+ teir3 t ei3 rr
3679
+ teir4 t ei4 rr
3680
+ teir5 t ei5 rr
3681
+ tengr1 t eng1 rr
3682
+ tengr2 t eng2 rr
3683
+ tengr3 t eng3 rr
3684
+ tengr4 t eng4 rr
3685
+ tengr5 t eng5 rr
3686
+ tir1 t i1 rr
3687
+ tir2 t i2 rr
3688
+ tir3 t i3 rr
3689
+ tir4 t i4 rr
3690
+ tir5 t i5 rr
3691
+ tianr1 t ian1 rr
3692
+ tianr2 t ian2 rr
3693
+ tianr3 t ian3 rr
3694
+ tianr4 t ian4 rr
3695
+ tianr5 t ian5 rr
3696
+ tiaor1 t iao1 rr
3697
+ tiaor2 t iao2 rr
3698
+ tiaor3 t iao3 rr
3699
+ tiaor4 t iao4 rr
3700
+ tiaor5 t iao5 rr
3701
+ tier1 t ie1 rr
3702
+ tier2 t ie2 rr
3703
+ tier3 t ie3 rr
3704
+ tier4 t ie4 rr
3705
+ tier5 t ie5 rr
3706
+ tingr1 t ing1 rr
3707
+ tingr2 t ing2 rr
3708
+ tingr3 t ing3 rr
3709
+ tingr4 t ing4 rr
3710
+ tingr5 t ing5 rr
3711
+ tongr1 t ong1 rr
3712
+ tongr2 t ong2 rr
3713
+ tongr3 t ong3 rr
3714
+ tongr4 t ong4 rr
3715
+ tongr5 t ong5 rr
3716
+ tour1 t ou1 rr
3717
+ tour2 t ou2 rr
3718
+ tour3 t ou3 rr
3719
+ tour4 t ou4 rr
3720
+ tour5 t ou5 rr
3721
+ tur1 t u1 rr
3722
+ tur2 t u2 rr
3723
+ tur3 t u3 rr
3724
+ tur4 t u4 rr
3725
+ tur5 t u5 rr
3726
+ tuanr1 t uan1 rr
3727
+ tuanr2 t uan2 rr
3728
+ tuanr3 t uan3 rr
3729
+ tuanr4 t uan4 rr
3730
+ tuanr5 t uan5 rr
3731
+ tuir1 t uei1 rr
3732
+ tuir2 t uei2 rr
3733
+ tuir3 t uei3 rr
3734
+ tuir4 t uei4 rr
3735
+ tuir5 t uei5 rr
3736
+ tunr1 t uen1 rr
3737
+ tunr2 t uen2 rr
3738
+ tunr3 t uen3 rr
3739
+ tunr4 t uen4 rr
3740
+ tunr5 t uen5 rr
3741
+ tuor1 t uo1 rr
3742
+ tuor2 t uo2 rr
3743
+ tuor3 t uo3 rr
3744
+ tuor4 t uo4 rr
3745
+ tuor5 t uo5 rr
3746
+ war1 w ua1 rr
3747
+ war2 w ua2 rr
3748
+ war3 w ua3 rr
3749
+ war4 w ua4 rr
3750
+ war5 w ua5 rr
3751
+ wair1 w uai1 rr
3752
+ wair2 w uai2 rr
3753
+ wair3 w uai3 rr
3754
+ wair4 w uai4 rr
3755
+ wair5 w uai5 rr
3756
+ wanr1 w uan1 rr
3757
+ wanr2 w uan2 rr
3758
+ wanr3 w uan3 rr
3759
+ wanr4 w uan4 rr
3760
+ wanr5 w uan5 rr
3761
+ wangr1 w uang1 rr
3762
+ wangr2 w uang2 rr
3763
+ wangr3 w uang3 rr
3764
+ wangr4 w uang4 rr
3765
+ wangr5 w uang5 rr
3766
+ weir1 w uei1 rr
3767
+ weir2 w uei2 rr
3768
+ weir3 w uei3 rr
3769
+ weir4 w uei4 rr
3770
+ weir5 w uei5 rr
3771
+ wenr1 w uen1 rr
3772
+ wenr2 w uen2 rr
3773
+ wenr3 w uen3 rr
3774
+ wenr4 w uen4 rr
3775
+ wenr5 w uen5 rr
3776
+ wengr1 w uen1 rr
3777
+ wengr2 w uen2 rr
3778
+ wengr3 w uen3 rr
3779
+ wengr4 w uen4 rr
3780
+ wengr5 w uen5 rr
3781
+ wor1 w uo1 rr
3782
+ wor2 w uo2 rr
3783
+ wor3 w uo3 rr
3784
+ wor4 w uo4 rr
3785
+ wor5 w uo5 rr
3786
+ wur1 w u1 rr
3787
+ wur2 w u2 rr
3788
+ wur3 w u3 rr
3789
+ wur4 w u4 rr
3790
+ wur5 w u5 rr
3791
+ xir1 x i1 rr
3792
+ xir2 x i2 rr
3793
+ xir3 x i3 rr
3794
+ xir4 x i4 rr
3795
+ xir5 x i5 rr
3796
+ xiar1 x ia1 rr
3797
+ xiar2 x ia2 rr
3798
+ xiar3 x ia3 rr
3799
+ xiar4 x ia4 rr
3800
+ xiar5 x ia5 rr
3801
+ xianr1 x ian1 rr
3802
+ xianr2 x ian2 rr
3803
+ xianr3 x ian3 rr
3804
+ xianr4 x ian4 rr
3805
+ xianr5 x ian5 rr
3806
+ xiangr1 x iang1 rr
3807
+ xiangr2 x iang2 rr
3808
+ xiangr3 x iang3 rr
3809
+ xiangr4 x iang4 rr
3810
+ xiangr5 x iang5 rr
3811
+ xiaor1 x iao1 rr
3812
+ xiaor2 x iao2 rr
3813
+ xiaor3 x iao3 rr
3814
+ xiaor4 x iao4 rr
3815
+ xiaor5 x iao5 rr
3816
+ xier1 x ie1 rr
3817
+ xier2 x ie2 rr
3818
+ xier3 x ie3 rr
3819
+ xier4 x ie4 rr
3820
+ xier5 x ie5 rr
3821
+ xinr1 x in1 rr
3822
+ xinr2 x in2 rr
3823
+ xinr3 x in3 rr
3824
+ xinr4 x in4 rr
3825
+ xinr5 x in5 rr
3826
+ xingr1 x ing1 rr
3827
+ xingr2 x ing2 rr
3828
+ xingr3 x ing3 rr
3829
+ xingr4 x ing4 rr
3830
+ xingr5 x ing5 rr
3831
+ xiongr1 x iong1 rr
3832
+ xiongr2 x iong2 rr
3833
+ xiongr3 x iong3 rr
3834
+ xiongr4 x iong4 rr
3835
+ xiongr5 x iong5 rr
3836
+ xiur1 x iou1 rr
3837
+ xiur2 x iou2 rr
3838
+ xiur3 x iou3 rr
3839
+ xiur4 x iou4 rr
3840
+ xiur5 x iou5 rr
3841
+ xur1 x v1 rr
3842
+ xur2 x v2 rr
3843
+ xur3 x v3 rr
3844
+ xur4 x v4 rr
3845
+ xur5 x v5 rr
3846
+ xuanr1 x van1 rr
3847
+ xuanr2 x van2 rr
3848
+ xuanr3 x van3 rr
3849
+ xuanr4 x van4 rr
3850
+ xuanr5 x van5 rr
3851
+ xuer1 x ve1 rr
3852
+ xuer2 x ve2 rr
3853
+ xuer3 x ve3 rr
3854
+ xuer4 x ve4 rr
3855
+ xuer5 x ve5 rr
3856
+ xunr1 x vn1 rr
3857
+ xunr2 x vn2 rr
3858
+ xunr3 x vn3 rr
3859
+ xunr4 x vn4 rr
3860
+ xunr5 x vn5 rr
3861
+ yar1 y ia1 rr
3862
+ yar2 y ia2 rr
3863
+ yar3 y ia3 rr
3864
+ yar4 y ia4 rr
3865
+ yar5 y ia5 rr
3866
+ yanr1 y ian1 rr
3867
+ yanr2 y ian2 rr
3868
+ yanr3 y ian3 rr
3869
+ yanr4 y ian4 rr
3870
+ yanr5 y ian5 rr
3871
+ yangr1 y iang1 rr
3872
+ yangr2 y iang2 rr
3873
+ yangr3 y iang3 rr
3874
+ yangr4 y iang4 rr
3875
+ yangr5 y iang5 rr
3876
+ yaor1 y iao1 rr
3877
+ yaor2 y iao2 rr
3878
+ yaor3 y iao3 rr
3879
+ yaor4 y iao4 rr
3880
+ yaor5 y iao5 rr
3881
+ yer1 y ie1 rr
3882
+ yer2 y ie2 rr
3883
+ yer3 y ie3 rr
3884
+ yer4 y ie4 rr
3885
+ yer5 y ie5 rr
3886
+ yir1 y i1 rr
3887
+ yir2 y i2 rr
3888
+ yir3 y i3 rr
3889
+ yir4 y i4 rr
3890
+ yir5 y i5 rr
3891
+ yinr1 y in1 rr
3892
+ yinr2 y in2 rr
3893
+ yinr3 y in3 rr
3894
+ yinr4 y in4 rr
3895
+ yinr5 y in5 rr
3896
+ yingr1 y ing1 rr
3897
+ yingr2 y ing2 rr
3898
+ yingr3 y ing3 rr
3899
+ yingr4 y ing4 rr
3900
+ yingr5 y ing5 rr
3901
+ yor1 y iou1 rr
3902
+ yor2 y iou2 rr
3903
+ yor3 y iou3 rr
3904
+ yor4 y iou4 rr
3905
+ yor5 y iou5 rr
3906
+ yongr1 y iong1 rr
3907
+ yongr2 y iong2 rr
3908
+ yongr3 y iong3 rr
3909
+ yongr4 y iong4 rr
3910
+ yongr5 y iong5 rr
3911
+ your1 y iou1 rr
3912
+ your2 y iou2 rr
3913
+ your3 y iou3 rr
3914
+ your4 y iou4 rr
3915
+ your5 y iou5 rr
3916
+ yur1 y v1 rr
3917
+ yur2 y v2 rr
3918
+ yur3 y v3 rr
3919
+ yur4 y v4 rr
3920
+ yur5 y v5 rr
3921
+ yuanr1 y van1 rr
3922
+ yuanr2 y van2 rr
3923
+ yuanr3 y van3 rr
3924
+ yuanr4 y van4 rr
3925
+ yuanr5 y van5 rr
3926
+ yuer1 y ve1 rr
3927
+ yuer2 y ve2 rr
3928
+ yuer3 y ve3 rr
3929
+ yuer4 y ve4 rr
3930
+ yuer5 y ve5 rr
3931
+ yunr1 y vn1 rr
3932
+ yunr2 y vn2 rr
3933
+ yunr3 y vn3 rr
3934
+ yunr4 y vn4 rr
3935
+ yunr5 y vn5 rr
3936
+ zar1 z a1 rr
3937
+ zar2 z a2 rr
3938
+ zar3 z a3 rr
3939
+ zar4 z a4 rr
3940
+ zar5 z a5 rr
3941
+ zair1 z ai1 rr
3942
+ zair2 z ai2 rr
3943
+ zair3 z ai3 rr
3944
+ zair4 z ai4 rr
3945
+ zair5 z ai5 rr
3946
+ zanr1 z an1 rr
3947
+ zanr2 z an2 rr
3948
+ zanr3 z an3 rr
3949
+ zanr4 z an4 rr
3950
+ zanr5 z an5 rr
3951
+ zangr1 z ang1 rr
3952
+ zangr2 z ang2 rr
3953
+ zangr3 z ang3 rr
3954
+ zangr4 z ang4 rr
3955
+ zangr5 z ang5 rr
3956
+ zaor1 z ao1 rr
3957
+ zaor2 z ao2 rr
3958
+ zaor3 z ao3 rr
3959
+ zaor4 z ao4 rr
3960
+ zaor5 z ao5 rr
3961
+ zer1 z e1 rr
3962
+ zer2 z e2 rr
3963
+ zer3 z e3 rr
3964
+ zer4 z e4 rr
3965
+ zer5 z e5 rr
3966
+ zeir1 z ei1 rr
3967
+ zeir2 z ei2 rr
3968
+ zeir3 z ei3 rr
3969
+ zeir4 z ei4 rr
3970
+ zeir5 z ei5 rr
3971
+ zenr1 z en1 rr
3972
+ zenr2 z en2 rr
3973
+ zenr3 z en3 rr
3974
+ zenr4 z en4 rr
3975
+ zenr5 z en5 rr
3976
+ zengr1 z eng1 rr
3977
+ zengr2 z eng2 rr
3978
+ zengr3 z eng3 rr
3979
+ zengr4 z eng4 rr
3980
+ zengr5 z eng5 rr
3981
+ zhar1 zh a1 rr
3982
+ zhar2 zh a2 rr
3983
+ zhar3 zh a3 rr
3984
+ zhar4 zh a4 rr
3985
+ zhar5 zh a5 rr
3986
+ zhair1 zh ai1 rr
3987
+ zhair2 zh ai2 rr
3988
+ zhair3 zh ai3 rr
3989
+ zhair4 zh ai4 rr
3990
+ zhair5 zh ai5 rr
3991
+ zhanr1 zh an1 rr
3992
+ zhanr2 zh an2 rr
3993
+ zhanr3 zh an3 rr
3994
+ zhanr4 zh an4 rr
3995
+ zhanr5 zh an5 rr
3996
+ zhangr1 zh ang1 rr
3997
+ zhangr2 zh ang2 rr
3998
+ zhangr3 zh ang3 rr
3999
+ zhangr4 zh ang4 rr
4000
+ zhangr5 zh ang5 rr
4001
+ zhaor1 zh ao1 rr
4002
+ zhaor2 zh ao2 rr
4003
+ zhaor3 zh ao3 rr
4004
+ zhaor4 zh ao4 rr
4005
+ zhaor5 zh ao5 rr
4006
+ zher1 zh e1 rr
4007
+ zher2 zh e2 rr
4008
+ zher3 zh e3 rr
4009
+ zher4 zh e4 rr
4010
+ zher5 zh e5 rr
4011
+ zheir1 zh ei1 rr
4012
+ zheir2 zh ei2 rr
4013
+ zheir3 zh ei3 rr
4014
+ zheir4 zh ei4 rr
4015
+ zheir5 zh ei5 rr
4016
+ zhenr1 zh en1 rr
4017
+ zhenr2 zh en2 rr
4018
+ zhenr3 zh en3 rr
4019
+ zhenr4 zh en4 rr
4020
+ zhenr5 zh en5 rr
4021
+ zhengr1 zh eng1 rr
4022
+ zhengr2 zh eng2 rr
4023
+ zhengr3 zh eng3 rr
4024
+ zhengr4 zh eng4 rr
4025
+ zhengr5 zh eng5 rr
4026
+ zhir1 zh iii1 rr
4027
+ zhir2 zh iii2 rr
4028
+ zhir3 zh iii3 rr
4029
+ zhir4 zh iii4 rr
4030
+ zhir5 zh iii5 rr
4031
+ zhongr1 zh ong1 rr
4032
+ zhongr2 zh ong2 rr
4033
+ zhongr3 zh ong3 rr
4034
+ zhongr4 zh ong4 rr
4035
+ zhongr5 zh ong5 rr
4036
+ zhour1 zh ou1 rr
4037
+ zhour2 zh ou2 rr
4038
+ zhour3 zh ou3 rr
4039
+ zhour4 zh ou4 rr
4040
+ zhour5 zh ou5 rr
4041
+ zhur1 zh u1 rr
4042
+ zhur2 zh u2 rr
4043
+ zhur3 zh u3 rr
4044
+ zhur4 zh u4 rr
4045
+ zhur5 zh u5 rr
4046
+ zhuar1 zh ua1 rr
4047
+ zhuar2 zh ua2 rr
4048
+ zhuar3 zh ua3 rr
4049
+ zhuar4 zh ua4 rr
4050
+ zhuar5 zh ua5 rr
4051
+ zhuair1 zh uai1 rr
4052
+ zhuair2 zh uai2 rr
4053
+ zhuair3 zh uai3 rr
4054
+ zhuair4 zh uai4 rr
4055
+ zhuair5 zh uai5 rr
4056
+ zhuanr1 zh uan1 rr
4057
+ zhuanr2 zh uan2 rr
4058
+ zhuanr3 zh uan3 rr
4059
+ zhuanr4 zh uan4 rr
4060
+ zhuanr5 zh uan5 rr
4061
+ zhuangr1 zh uang1 rr
4062
+ zhuangr2 zh uang2 rr
4063
+ zhuangr3 zh uang3 rr
4064
+ zhuangr4 zh uang4 rr
4065
+ zhuangr5 zh uang5 rr
4066
+ zhuir1 zh uei1 rr
4067
+ zhuir2 zh uei2 rr
4068
+ zhuir3 zh uei3 rr
4069
+ zhuir4 zh uei4 rr
4070
+ zhuir5 zh uei5 rr
4071
+ zhunr1 zh uen1 rr
4072
+ zhunr2 zh uen2 rr
4073
+ zhunr3 zh uen3 rr
4074
+ zhunr4 zh uen4 rr
4075
+ zhunr5 zh uen5 rr
4076
+ zhuor1 zh uo1 rr
4077
+ zhuor2 zh uo2 rr
4078
+ zhuor3 zh uo3 rr
4079
+ zhuor4 zh uo4 rr
4080
+ zhuor5 zh uo5 rr
4081
+ zir1 z ii1 rr
4082
+ zir2 z ii2 rr
4083
+ zir3 z ii3 rr
4084
+ zir4 z ii4 rr
4085
+ zir5 z ii5 rr
4086
+ zongr1 z ong1 rr
4087
+ zongr2 z ong2 rr
4088
+ zongr3 z ong3 rr
4089
+ zongr4 z ong4 rr
4090
+ zongr5 z ong5 rr
4091
+ zour1 z ou1 rr
4092
+ zour2 z ou2 rr
4093
+ zour3 z ou3 rr
4094
+ zour4 z ou4 rr
4095
+ zour5 z ou5 rr
4096
+ zur1 z u1 rr
4097
+ zur2 z u2 rr
4098
+ zur3 z u3 rr
4099
+ zur4 z u4 rr
4100
+ zur5 z u5 rr
4101
+ zuanr1 z uan1 rr
4102
+ zuanr2 z uan2 rr
4103
+ zuanr3 z uan3 rr
4104
+ zuanr4 z uan4 rr
4105
+ zuanr5 z uan5 rr
4106
+ zuir1 z uei1 rr
4107
+ zuir2 z uei2 rr
4108
+ zuir3 z uei3 rr
4109
+ zuir4 z uei4 rr
4110
+ zuir5 z uei5 rr
4111
+ zunr1 z uen1 rr
4112
+ zunr2 z uen2 rr
4113
+ zunr3 z uen3 rr
4114
+ zunr4 z uen4 rr
4115
+ zunr5 z uen5 rr
4116
+ zuor1 z uo1 rr
4117
+ zuor2 z uo2 rr
4118
+ zuor3 z uo3 rr
4119
+ zuor4 z uo4 rr
4120
+ zuor5 z uo5 rr
lemas_tts/infer/text_norm/symbols.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pinyin_dict = {
2
+ "a": ("^", "a"),
3
+ "ai": ("^", "ai"),
4
+ "an": ("^", "an"),
5
+ "ang": ("^", "ang"),
6
+ "ao": ("^", "ao"),
7
+ "ba": ("b", "a"),
8
+ "bai": ("b", "ai"),
9
+ "ban": ("b", "an"),
10
+ "bang": ("b", "ang"),
11
+ "bao": ("b", "ao"),
12
+ "be": ("b", "e"),
13
+ "bei": ("b", "ei"),
14
+ "ben": ("b", "en"),
15
+ "beng": ("b", "eng"),
16
+ "bi": ("b", "i"),
17
+ "bian": ("b", "ian"),
18
+ "biao": ("b", "iao"),
19
+ "bie": ("b", "ie"),
20
+ "bin": ("b", "in"),
21
+ "bing": ("b", "ing"),
22
+ "bo": ("b", "o"),
23
+ "bu": ("b", "u"),
24
+ "ca": ("c", "a"),
25
+ "cai": ("c", "ai"),
26
+ "can": ("c", "an"),
27
+ "cang": ("c", "ang"),
28
+ "cao": ("c", "ao"),
29
+ "ce": ("c", "e"),
30
+ "cen": ("c", "en"),
31
+ "ceng": ("c", "eng"),
32
+ "cha": ("ch", "a"),
33
+ "chai": ("ch", "ai"),
34
+ "chan": ("ch", "an"),
35
+ "chang": ("ch", "ang"),
36
+ "chao": ("ch", "ao"),
37
+ "che": ("ch", "e"),
38
+ "chen": ("ch", "en"),
39
+ "cheng": ("ch", "eng"),
40
+ "chi": ("ch", "iii"),
41
+ "chong": ("ch", "ong"),
42
+ "chou": ("ch", "ou"),
43
+ "chu": ("ch", "u"),
44
+ "chua": ("ch", "ua"),
45
+ "chuai": ("ch", "uai"),
46
+ "chuan": ("ch", "uan"),
47
+ "chuang": ("ch", "uang"),
48
+ "chui": ("ch", "uei"),
49
+ "chun": ("ch", "uen"),
50
+ "chuo": ("ch", "uo"),
51
+ "ci": ("c", "ii"),
52
+ "cong": ("c", "ong"),
53
+ "cou": ("c", "ou"),
54
+ "cu": ("c", "u"),
55
+ "cuan": ("c", "uan"),
56
+ "cui": ("c", "uei"),
57
+ "cun": ("c", "uen"),
58
+ "cuo": ("c", "uo"),
59
+ "da": ("d", "a"),
60
+ "dai": ("d", "ai"),
61
+ "dan": ("d", "an"),
62
+ "dang": ("d", "ang"),
63
+ "dao": ("d", "ao"),
64
+ "de": ("d", "e"),
65
+ "dei": ("d", "ei"),
66
+ "den": ("d", "en"),
67
+ "deng": ("d", "eng"),
68
+ "di": ("d", "i"),
69
+ "dia": ("d", "ia"),
70
+ "dian": ("d", "ian"),
71
+ "diao": ("d", "iao"),
72
+ "die": ("d", "ie"),
73
+ "ding": ("d", "ing"),
74
+ "diu": ("d", "iou"),
75
+ "dong": ("d", "ong"),
76
+ "dou": ("d", "ou"),
77
+ "du": ("d", "u"),
78
+ "duan": ("d", "uan"),
79
+ "dui": ("d", "uei"),
80
+ "dun": ("d", "uen"),
81
+ "duo": ("d", "uo"),
82
+ "e": ("^", "e"),
83
+ "ei": ("^", "ei"),
84
+ "en": ("^", "en"),
85
+ "ng": ("^", "en"),
86
+ "eng": ("^", "eng"),
87
+ "er": ("^", "er"),
88
+ "fa": ("f", "a"),
89
+ "fan": ("f", "an"),
90
+ "fang": ("f", "ang"),
91
+ "fei": ("f", "ei"),
92
+ "fen": ("f", "en"),
93
+ "feng": ("f", "eng"),
94
+ "fo": ("f", "o"),
95
+ "fou": ("f", "ou"),
96
+ "fu": ("f", "u"),
97
+ "ga": ("g", "a"),
98
+ "gai": ("g", "ai"),
99
+ "gan": ("g", "an"),
100
+ "gang": ("g", "ang"),
101
+ "gao": ("g", "ao"),
102
+ "ge": ("g", "e"),
103
+ "gei": ("g", "ei"),
104
+ "gen": ("g", "en"),
105
+ "geng": ("g", "eng"),
106
+ "gong": ("g", "ong"),
107
+ "gou": ("g", "ou"),
108
+ "gu": ("g", "u"),
109
+ "gua": ("g", "ua"),
110
+ "guai": ("g", "uai"),
111
+ "guan": ("g", "uan"),
112
+ "guang": ("g", "uang"),
113
+ "gui": ("g", "uei"),
114
+ "gun": ("g", "uen"),
115
+ "guo": ("g", "uo"),
116
+ "ha": ("h", "a"),
117
+ "hai": ("h", "ai"),
118
+ "han": ("h", "an"),
119
+ "hang": ("h", "ang"),
120
+ "hao": ("h", "ao"),
121
+ "he": ("h", "e"),
122
+ "hei": ("h", "ei"),
123
+ "hen": ("h", "en"),
124
+ "heng": ("h", "eng"),
125
+ "hong": ("h", "ong"),
126
+ "hou": ("h", "ou"),
127
+ "hu": ("h", "u"),
128
+ "hua": ("h", "ua"),
129
+ "huai": ("h", "uai"),
130
+ "huan": ("h", "uan"),
131
+ "huang": ("h", "uang"),
132
+ "hui": ("h", "uei"),
133
+ "hun": ("h", "uen"),
134
+ "huo": ("h", "uo"),
135
+ "ji": ("j", "i"),
136
+ "jia": ("j", "ia"),
137
+ "jian": ("j", "ian"),
138
+ "jiang": ("j", "iang"),
139
+ "jiao": ("j", "iao"),
140
+ "jie": ("j", "ie"),
141
+ "jin": ("j", "in"),
142
+ "jing": ("j", "ing"),
143
+ "jiong": ("j", "iong"),
144
+ "jiu": ("j", "iou"),
145
+ "ju": ("j", "v"),
146
+ "juan": ("j", "van"),
147
+ "jue": ("j", "ve"),
148
+ "jun": ("j", "vn"),
149
+ "ka": ("k", "a"),
150
+ "kai": ("k", "ai"),
151
+ "kan": ("k", "an"),
152
+ "kang": ("k", "ang"),
153
+ "kao": ("k", "ao"),
154
+ "ke": ("k", "e"),
155
+ "kei": ("k", "ei"),
156
+ "ken": ("k", "en"),
157
+ "keng": ("k", "eng"),
158
+ "kong": ("k", "ong"),
159
+ "kou": ("k", "ou"),
160
+ "ku": ("k", "u"),
161
+ "kua": ("k", "ua"),
162
+ "kuai": ("k", "uai"),
163
+ "kuan": ("k", "uan"),
164
+ "kuang": ("k", "uang"),
165
+ "kui": ("k", "uei"),
166
+ "kun": ("k", "uen"),
167
+ "kuo": ("k", "uo"),
168
+ "la": ("l", "a"),
169
+ "lai": ("l", "ai"),
170
+ "lan": ("l", "an"),
171
+ "lang": ("l", "ang"),
172
+ "lao": ("l", "ao"),
173
+ "le": ("l", "e"),
174
+ "lei": ("l", "ei"),
175
+ "leng": ("l", "eng"),
176
+ "li": ("l", "i"),
177
+ "lia": ("l", "ia"),
178
+ "lian": ("l", "ian"),
179
+ "liang": ("l", "iang"),
180
+ "liao": ("l", "iao"),
181
+ "lie": ("l", "ie"),
182
+ "lin": ("l", "in"),
183
+ "ling": ("l", "ing"),
184
+ "liu": ("l", "iou"),
185
+ "lo": ("l", "o"),
186
+ "long": ("l", "ong"),
187
+ "lou": ("l", "ou"),
188
+ "lu": ("l", "u"),
189
+ "lv": ("l", "v"),
190
+ "luan": ("l", "uan"),
191
+ "lve": ("l", "ve"),
192
+ "lue": ("l", "ve"),
193
+ "lun": ("l", "uen"),
194
+ "luo": ("l", "uo"),
195
+ "ma": ("m", "a"),
196
+ "mai": ("m", "ai"),
197
+ "man": ("m", "an"),
198
+ "mang": ("m", "ang"),
199
+ "mao": ("m", "ao"),
200
+ "me": ("m", "e"),
201
+ "mei": ("m", "ei"),
202
+ "men": ("m", "en"),
203
+ "meng": ("m", "eng"),
204
+ "mi": ("m", "i"),
205
+ "mian": ("m", "ian"),
206
+ "miao": ("m", "iao"),
207
+ "mie": ("m", "ie"),
208
+ "min": ("m", "in"),
209
+ "ming": ("m", "ing"),
210
+ "miu": ("m", "iou"),
211
+ "mo": ("m", "o"),
212
+ "mou": ("m", "ou"),
213
+ "mu": ("m", "u"),
214
+ "na": ("n", "a"),
215
+ "nai": ("n", "ai"),
216
+ "nan": ("n", "an"),
217
+ "nang": ("n", "ang"),
218
+ "nao": ("n", "ao"),
219
+ "ne": ("n", "e"),
220
+ "nei": ("n", "ei"),
221
+ "nen": ("n", "en"),
222
+ "neng": ("n", "eng"),
223
+ "ni": ("n", "i"),
224
+ "nia": ("n", "ia"),
225
+ "nian": ("n", "ian"),
226
+ "niang": ("n", "iang"),
227
+ "niao": ("n", "iao"),
228
+ "nie": ("n", "ie"),
229
+ "nin": ("n", "in"),
230
+ "ning": ("n", "ing"),
231
+ "niu": ("n", "iou"),
232
+ "nong": ("n", "ong"),
233
+ "nou": ("n", "ou"),
234
+ "nu": ("n", "u"),
235
+ "nv": ("n", "v"),
236
+ "nuan": ("n", "uan"),
237
+ "nve": ("n", "ve"),
238
+ "nue": ("n", "ve"),
239
+ "nuo": ("n", "uo"),
240
+ "o": ("^", "o"),
241
+ "ou": ("^", "ou"),
242
+ "pa": ("p", "a"),
243
+ "pai": ("p", "ai"),
244
+ "pan": ("p", "an"),
245
+ "pang": ("p", "ang"),
246
+ "pao": ("p", "ao"),
247
+ "pe": ("p", "e"),
248
+ "pei": ("p", "ei"),
249
+ "pen": ("p", "en"),
250
+ "peng": ("p", "eng"),
251
+ "pi": ("p", "i"),
252
+ "pian": ("p", "ian"),
253
+ "piao": ("p", "iao"),
254
+ "pie": ("p", "ie"),
255
+ "pin": ("p", "in"),
256
+ "ping": ("p", "ing"),
257
+ "po": ("p", "o"),
258
+ "pou": ("p", "ou"),
259
+ "pu": ("p", "u"),
260
+ "qi": ("q", "i"),
261
+ "qia": ("q", "ia"),
262
+ "qian": ("q", "ian"),
263
+ "qiang": ("q", "iang"),
264
+ "qiao": ("q", "iao"),
265
+ "qie": ("q", "ie"),
266
+ "qin": ("q", "in"),
267
+ "qing": ("q", "ing"),
268
+ "qiong": ("q", "iong"),
269
+ "qiu": ("q", "iou"),
270
+ "qu": ("q", "v"),
271
+ "quan": ("q", "van"),
272
+ "que": ("q", "ve"),
273
+ "qun": ("q", "vn"),
274
+ "ran": ("r", "an"),
275
+ "rang": ("r", "ang"),
276
+ "rao": ("r", "ao"),
277
+ "re": ("r", "e"),
278
+ "ren": ("r", "en"),
279
+ "reng": ("r", "eng"),
280
+ "ri": ("r", "iii"),
281
+ "rong": ("r", "ong"),
282
+ "rou": ("r", "ou"),
283
+ "ru": ("r", "u"),
284
+ "rua": ("r", "ua"),
285
+ "ruan": ("r", "uan"),
286
+ "rui": ("r", "uei"),
287
+ "run": ("r", "uen"),
288
+ "ruo": ("r", "uo"),
289
+ "sa": ("s", "a"),
290
+ "sai": ("s", "ai"),
291
+ "san": ("s", "an"),
292
+ "sang": ("s", "ang"),
293
+ "sao": ("s", "ao"),
294
+ "se": ("s", "e"),
295
+ "sen": ("s", "en"),
296
+ "seng": ("s", "eng"),
297
+ "sha": ("sh", "a"),
298
+ "shai": ("sh", "ai"),
299
+ "shan": ("sh", "an"),
300
+ "shang": ("sh", "ang"),
301
+ "shao": ("sh", "ao"),
302
+ "she": ("sh", "e"),
303
+ "shei": ("sh", "ei"),
304
+ "shen": ("sh", "en"),
305
+ "sheng": ("sh", "eng"),
306
+ "shi": ("sh", "iii"),
307
+ "shou": ("sh", "ou"),
308
+ "shu": ("sh", "u"),
309
+ "shua": ("sh", "ua"),
310
+ "shuai": ("sh", "uai"),
311
+ "shuan": ("sh", "uan"),
312
+ "shuang": ("sh", "uang"),
313
+ "shui": ("sh", "uei"),
314
+ "shun": ("sh", "uen"),
315
+ "shuo": ("sh", "uo"),
316
+ "si": ("s", "ii"),
317
+ "song": ("s", "ong"),
318
+ "sou": ("s", "ou"),
319
+ "su": ("s", "u"),
320
+ "suan": ("s", "uan"),
321
+ "sui": ("s", "uei"),
322
+ "sun": ("s", "uen"),
323
+ "suo": ("s", "uo"),
324
+ "ta": ("t", "a"),
325
+ "tai": ("t", "ai"),
326
+ "tan": ("t", "an"),
327
+ "tang": ("t", "ang"),
328
+ "tao": ("t", "ao"),
329
+ "te": ("t", "e"),
330
+ "tei": ("t", "ei"),
331
+ "teng": ("t", "eng"),
332
+ "ti": ("t", "i"),
333
+ "tian": ("t", "ian"),
334
+ "tiao": ("t", "iao"),
335
+ "tie": ("t", "ie"),
336
+ "ting": ("t", "ing"),
337
+ "tong": ("t", "ong"),
338
+ "tou": ("t", "ou"),
339
+ "tu": ("t", "u"),
340
+ "tuan": ("t", "uan"),
341
+ "tui": ("t", "uei"),
342
+ "tun": ("t", "uen"),
343
+ "tuo": ("t", "uo"),
344
+ "wa": ("^", "ua"),
345
+ "wai": ("^", "uai"),
346
+ "wan": ("^", "uan"),
347
+ "wang": ("^", "uang"),
348
+ "wei": ("^", "uei"),
349
+ "wen": ("^", "uen"),
350
+ "weng": ("^", "ueng"),
351
+ "wo": ("^", "uo"),
352
+ "wu": ("^", "u"),
353
+ "xi": ("x", "i"),
354
+ "xia": ("x", "ia"),
355
+ "xian": ("x", "ian"),
356
+ "xiang": ("x", "iang"),
357
+ "xiao": ("x", "iao"),
358
+ "xie": ("x", "ie"),
359
+ "xin": ("x", "in"),
360
+ "xing": ("x", "ing"),
361
+ "xiong": ("x", "iong"),
362
+ "xiu": ("x", "iou"),
363
+ "xu": ("x", "v"),
364
+ "xuan": ("x", "van"),
365
+ "xue": ("x", "ve"),
366
+ "xun": ("x", "vn"),
367
+ "ya": ("^", "ia"),
368
+ "yan": ("^", "ian"),
369
+ "yang": ("^", "iang"),
370
+ "yao": ("^", "iao"),
371
+ "ye": ("^", "ie"),
372
+ "yi": ("^", "i"),
373
+ "yin": ("^", "in"),
374
+ "ying": ("^", "ing"),
375
+ "yo": ("^", "iou"),
376
+ "yong": ("^", "iong"),
377
+ "you": ("^", "iou"),
378
+ "yu": ("^", "v"),
379
+ "yuan": ("^", "van"),
380
+ "yue": ("^", "ve"),
381
+ "yun": ("^", "vn"),
382
+ "za": ("z", "a"),
383
+ "zai": ("z", "ai"),
384
+ "zan": ("z", "an"),
385
+ "zang": ("z", "ang"),
386
+ "zao": ("z", "ao"),
387
+ "ze": ("z", "e"),
388
+ "zei": ("z", "ei"),
389
+ "zen": ("z", "en"),
390
+ "zeng": ("z", "eng"),
391
+ "zha": ("zh", "a"),
392
+ "zhai": ("zh", "ai"),
393
+ "zhan": ("zh", "an"),
394
+ "zhang": ("zh", "ang"),
395
+ "zhao": ("zh", "ao"),
396
+ "zhe": ("zh", "e"),
397
+ "zhei": ("zh", "ei"),
398
+ "zhen": ("zh", "en"),
399
+ "zheng": ("zh", "eng"),
400
+ "zhi": ("zh", "iii"),
401
+ "zhong": ("zh", "ong"),
402
+ "zhou": ("zh", "ou"),
403
+ "zhu": ("zh", "u"),
404
+ "zhua": ("zh", "ua"),
405
+ "zhuai": ("zh", "uai"),
406
+ "zhuan": ("zh", "uan"),
407
+ "zhuang": ("zh", "uang"),
408
+ "zhui": ("zh", "uei"),
409
+ "zhun": ("zh", "uen"),
410
+ "zhuo": ("zh", "uo"),
411
+ "zi": ("z", "ii"),
412
+ "zong": ("z", "ong"),
413
+ "zou": ("z", "ou"),
414
+ "zu": ("z", "u"),
415
+ "zuan": ("z", "uan"),
416
+ "zui": ("z", "uei"),
417
+ "zun": ("z", "uen"),
418
+ "zuo": ("z", "uo"),
419
+ }
lemas_tts/infer/text_norm/tokenizer.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # cp from https://github.com/lifeiteng/vall-e/blob/main/valle/data/tokenizer.py
2
+ # Copyright 2023 (authors: Feiteng Li)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import re, logging
18
+ from dataclasses import asdict, dataclass
19
+ from typing import Any, Dict, List, Optional, Pattern, Union
20
+ import math
21
+ import numpy as np
22
+ import torch
23
+ import torchaudio
24
+ # from lhotse.features import FeatureExtractor
25
+ # from lhotse.utils import Seconds, compute_num_frames
26
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
27
+ from phonemizer.backend import EspeakBackend
28
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
29
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
30
+ from phonemizer.punctuation import Punctuation
31
+ from phonemizer.separator import Separator
32
+
33
+ # Configure espeak-ng via espeakng_loader if available.
34
+ # This provides a consistent libespeak-ng + data across environments (e.g. HF Spaces).
35
+ try:
36
+ import espeakng_loader
37
+
38
+ EspeakWrapper.set_library(espeakng_loader.get_library_path())
39
+ data_path = espeakng_loader.get_data_path()
40
+ # Export data path via environment so underlying espeak-ng uses it.
41
+ os.environ["ESPEAK_DATA_PATH"] = data_path
42
+ os.environ["ESPEAKNG_DATA_PATH"] = data_path
43
+ print("[LEMAS-TTS] espeak-ng configured via espeakng_loader")
44
+ except Exception as e: # ImportError or runtime errors
45
+ # Fall back to system espeak-ng discovery.
46
+ print(f"[LEMAS-TTS] espeakng_loader not available or failed ({e}); using system espeak-ng")
47
+
48
+
49
+ class TextTokenizer:
50
+ """Phonemize Text."""
51
+
52
+ def __init__(
53
+ self,
54
+ language="en-us",
55
+ backend="espeak",
56
+ separator=Separator(word="_", syllable="-", phone="|"),
57
+ preserve_punctuation=True,
58
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
59
+ with_stress: bool = False,
60
+ tie: Union[bool, str] = False,
61
+ language_switch: LanguageSwitch = "keep-flags",
62
+ words_mismatch: WordMismatch = "ignore",
63
+ ) -> None:
64
+ phonemizer = EspeakBackend(
65
+ language,
66
+ punctuation_marks=punctuation_marks,
67
+ preserve_punctuation=preserve_punctuation,
68
+ with_stress=with_stress,
69
+ tie=tie,
70
+ language_switch=language_switch,
71
+ words_mismatch=words_mismatch,
72
+ )
73
+
74
+ self.backend = phonemizer
75
+ self.separator = separator
76
+
77
+ def to_list(self, phonemized: str) -> List[str]:
78
+ fields = []
79
+ for word in phonemized.split(self.separator.word):
80
+ # "ɐ m|iː|n?" ɹ|ɪ|z|ɜː|v; h|ɪ|z.
81
+ pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
82
+ fields.extend(
83
+ [p for p in pp if p != self.separator.phone]
84
+ + [self.separator.word]
85
+ )
86
+ assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
87
+ self.separator.phone
88
+ )
89
+ return fields[:-1]
90
+
91
+ def __call__(self, text, strip=True) -> List[List[str]]:
92
+ if isinstance(text, str):
93
+ text = [text]
94
+ phones = []
95
+ for txt in text:
96
+ if txt == '':
97
+ continue
98
+ if txt[0] == '#':
99
+ phones.append(txt)
100
+ else:
101
+ ipa = text_tokenizer.backend.phonemize([txt], separator=text_tokenizer.separator, strip=True, njobs=1, logger=logging.basicConfig(level=logging.ERROR))
102
+ phones += text_tokenizer.to_list(ipa[0])
103
+ return phones
104
+
105
+
106
+ def tokenize_text(tokenizer: TextTokenizer, text: str) -> List[str]:
107
+ phonemes = tokenizer([text.strip()])
108
+ return phonemes[0] # k2symbols
109
+
110
+
111
+ _PAUSE_SYMBOL = {'、':',', ',':',', '。':',', '!':'!', '?':'?', ':':':'}
112
+ def _replace(match):
113
+ word = match.group(0)
114
+ return _PAUSE_SYMBOL[word]
115
+
116
+ def txt2phone(tokenizer: TextTokenizer, text: str):
117
+ text = re.sub('|'.join(_PAUSE_SYMBOL.keys()), _replace, text)
118
+ text = re.split(r"(#\d)", text)
119
+ phones = []
120
+ for txt in text:
121
+ if txt == '':
122
+ continue
123
+ if txt[0] == '#':
124
+ phones.append(txt)
125
+ else:
126
+ ipa = tokenizer.backend.phonemize([txt], separator=tokenizer.separator, strip=True, njobs=1)
127
+ phones += tokenizer.to_list(ipa[0])
128
+ phones = "|".join(phones).replace("(|", "(").replace("|)", ")")
129
+ # phones = ["(cmn)"] + phones.split("|")
130
+ return phones
131
+
132
+
133
+ def convert_audio(wav: torch.Tensor, sr: int, target_sr: int, target_channels: int):
134
+ assert wav.shape[0] in [1, 2], "Audio must be mono or stereo."
135
+ if target_channels == 1:
136
+ wav = wav.mean(0, keepdim=True)
137
+ elif target_channels == 2:
138
+ *shape, _, length = wav.shape
139
+ wav = wav.expand(*shape, target_channels, length)
140
+ elif wav.shape[0] == 1:
141
+ wav = wav.expand(target_channels, -1)
142
+ wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
143
+ return wav
144
+
145
+
146
+ class AudioTokenizer:
147
+ """EnCodec audio."""
148
+
149
+ def __init__(
150
+ self,
151
+ device: Any = None,
152
+ signature = None
153
+ ) -> None:
154
+ from audiocraft.solvers import CompressionSolver
155
+ model = CompressionSolver.model_from_checkpoint(signature)
156
+ self.sample_rate = model.sample_rate
157
+ self.channels = model.channels
158
+
159
+ if not device:
160
+ device = torch.device("cpu")
161
+ if torch.cuda.is_available():
162
+ device = torch.device("cuda:0")
163
+
164
+ self._device = device
165
+
166
+ self.codec = model.to(device)
167
+
168
+ @property
169
+ def device(self):
170
+ return self._device
171
+
172
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
173
+ codes = self.codec.encode(wav.to(self.device))
174
+ return [(codes[0], None)]
175
+
176
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
177
+ frames = frames[0][0] # [1,4,T]
178
+ return self.codec.decode(frames)
179
+
180
+
181
+
182
+ def tokenize_audio(tokenizer: AudioTokenizer, audio, offset = -1, num_frames=-1):
183
+ # Load and pre-process the audio waveform
184
+ if type(audio) == str:
185
+ if offset != -1 and num_frames!=-1:
186
+ wav, sr = torchaudio.load(audio, frame_offset=offset, num_frames=num_frames)
187
+ else:
188
+ wav, sr = torchaudio.load(audio)
189
+ wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
190
+ wav = wav.unsqueeze(0)
191
+ else:
192
+ wav = audio.unsqueeze(0).unsqueeze(0)
193
+ # Extract discrete codes from EnCodec
194
+ with torch.no_grad():
195
+ encoded_frames = tokenizer.encode(wav)
196
+ return encoded_frames
197
+
198
+
199
+ class AudioSR:
200
+ """EnCodec audio."""
201
+
202
+ def __init__(
203
+ self,
204
+ model_path,
205
+ device = "cpu",
206
+ ) -> None:
207
+ import dac
208
+ self.codec = dac.DAC.load(model_path)
209
+ self.codec.to(device)
210
+ self.codec.eval()
211
+
212
+ self.sample_rate = self.codec.sample_rate
213
+ self.channels = 1
214
+ self._device = device
215
+
216
+ @property
217
+ def device(self):
218
+ return self._device
219
+
220
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
221
+ length = wav.shape[-1]
222
+ right_pad = math.ceil(length / self.codec.hop_length) * self.codec.hop_length - length
223
+ wav = torch.nn.functional.pad(wav, (0, right_pad))
224
+ z, codes, _, _, _ = self.codec.encode(wav.to(self._device))
225
+ return [(codes, z)]
226
+
227
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
228
+ # frames = frames[0][0] # [1,4,T]
229
+ # with torch.no_grad():
230
+ # z = self.codec.quantizer.from_codes(frames)[0]
231
+ # y = self.codec.decode(z)
232
+ z = frames[0][1] # [1, 2048, T]
233
+ with torch.no_grad():
234
+ y = self.codec.decode(z)
235
+ return y
lemas_tts/infer/text_norm/txt2pinyin.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import multiprocessing
2
+ from concurrent.futures import ProcessPoolExecutor
3
+ import argparse
4
+ import os, sys, re
5
+ from random import shuffle
6
+ from tqdm import tqdm
7
+ from pypinyin import Style
8
+ from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
9
+ from pypinyin.converter import DefaultConverter
10
+ from pypinyin.core import Pinyin
11
import jieba
# Point jieba at the dictionary shipped next to this module.
# BUGFIX: the original concatenated the filename into the single argument of
# os.path.join ("dirname + '/jieba_dict.txt'"); pass it as a separate
# component so the path is joined portably.
jieba.set_dictionary(dictionary_path=os.path.join(os.path.dirname(__file__), 'jieba_dict.txt'))
13
+
14
+ from .symbols import pinyin_dict
15
+ from .cn_tn import NSWNormalizer
16
+
17
+
18
zh_pattern = re.compile("[\u4e00-\u9fa5]")
alpha_pattern = re.compile(r"[a-zA-Z]")


def is_zh(word):
    """Return True when *word* contains at least one common CJK character."""
    return zh_pattern.search(word) is not None


def is_alpha(word):
    """Return True when *word* contains at least one ASCII letter."""
    return alpha_pattern.search(word) is not None
30
+
31
def get_phoneme_from_char_and_pinyin(chn_char, pinyin):
    """Align a Chinese character string with its pinyin and emit phoneme tokens.

    Applies tone sandhi for runs of third tones, the special reading of "嗯",
    and erhua ("儿化") merging; "#N" prosody marks are passed through and
    pause punctuation becomes "#3".

    Args:
        chn_char: character string, possibly containing "#N" prosody marks.
        pinyin: TONE3-style pinyin strings, one per Chinese character
            (modified in place by the sandhi rules).

    Returns:
        list[str]: phoneme/pause tokens, e.g. ["ce4", "shi4", "#3"].
    """
    # we do not need #4, use sil to replace it
    chn_char = chn_char.replace("#4", "")
    char_len = len(chn_char)
    i, j = 0, 0
    result = []
    while i < char_len:
        cur_char = chn_char[i]
        if is_zh(cur_char):
            if pinyin[j][:-1] == 'n':  # special pinyin of "嗯"
                pinyin[j] = 'en' + pinyin[j][-1]
            if i < len(chn_char) - 2 and is_zh(chn_char[i:i + 3]) and pinyin[j][-1] == pinyin[j + 1][-1] == pinyin[j + 2][-1] == '3':
                # three consecutive third tones: the middle becomes second tone
                pinyin[j + 1] = pinyin[j + 1][:-1] + '2'
            if i < len(chn_char) - 1 and pinyin[j][:-1] in pinyin_dict and is_zh(chn_char[i]) and is_zh(chn_char[i + 1]) and pinyin[j][-1] == pinyin[j + 1][-1] == '3':
                # two consecutive third tones: the first becomes second tone
                pinyin[j] = pinyin[j][:-1] + '2'
            if pinyin[j][:-1] not in pinyin_dict:  # erhua: merge "X儿" into X + er5
                assert chn_char[i + 1] == "儿", f"current_char : {cur_char}, next_char: {chn_char[i+1]}, cur_pinyin: {pinyin[j]}"
                assert pinyin[j][-2] == "r"
                tone = pinyin[j][-1]
                a = pinyin[j][:-2]
                # BUGFIX: the original appended the bare name `er5` (NameError);
                # the intended token is the string "er5".
                result += [a + tone, "er5"]
                if i + 2 < char_len and chn_char[i + 2] != "#":
                    result.append("#0")
                i += 2
                j += 1
            else:
                tone = pinyin[j][-1]
                a = pinyin[j][:-1]
                # (the initial/final split via pinyin_dict[a] was unused and
                # has been removed; full-syllable tokens are emitted instead)
                result.append(a + tone)
                i += 1
                j += 1
        # TODO support English alpha
        elif cur_char == "#":
            result.append(chn_char[i:i + 2])
            i += 2
        elif cur_char in _PAUSE_SYMBOL:  # punctuation: insert a long pause
            # NOTE(review): pop() was meant to drop a trailing "#0", but the
            # per-character "#0" appends are disabled, so this drops the
            # previous phoneme instead — confirm the intended behavior.
            if result:  # BUGFIX: guard against leading punctuation (empty list)
                result.pop()
            result.append("#3")
            i += 1
        else:
            # ignore the unknown char
            i += 1
    if result and result[-1] == "#0":  # drop a trailing #0 (guarded for empty result)
        result = result[:-1]
    assert j == len(pinyin)
    return result
97
+
98
# _PAUSE_SYMBOL = {'、', ',', '。', ',', '!', '!', '?', ':', ':', '《', '》', '·', '(', ')', '(', ')'}
# Pause punctuation mapping: keys are the full- and half-width marks recognized
# in the input; values are the ASCII marks appended to the preceding token.
_PAUSE_SYMBOL = {'.':'.', '、':',', ',':',', '。':'.', ',':',', '!':'!', '!':'!', '?':'?', '?':'?', ':':',', ':':',', '——':','}
100
+
101
class MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter that renders the neutral tone as explicit tone "5"."""
    pass
103
+
104
+
105
def checkErHuaYin(text, GT_pinyin):
    """Split erhua ("儿化") pinyin syllables so they line up with the text.

    When the punctuation-stripped text is longer than the pinyin list and
    contains "儿", syllables such as "huar1" are expanded into "hua1" + "er5",
    and the matched "X儿" pair is collapsed in the working text so that
    subsequent indices stay aligned.

    Returns:
        The (possibly expanded) pinyin list.
    """
    check_pattern = re.compile("[\\t\.\!\?\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()“”:;]+")
    check_text = check_pattern.sub('', text)
    if len(check_text) <= len(GT_pinyin) or '儿' not in check_text:
        return GT_pinyin
    expanded = []
    for idx, syllable in enumerate(GT_pinyin):
        is_erhua = (
            syllable[-2] == 'r'
            and syllable[:2] != 'er'
            and check_text[idx + 1] == '儿'
        )
        if is_erhua:
            expanded.append(syllable[:-2] + syllable[-1])
            expanded.append('er5')
            pair = check_text[idx:idx + 2]
            # collapse "X儿" -> "X" once so later indices keep matching
            check_text = re.compile(pair).sub(pair[:-1], check_text, count=1)
        else:
            expanded.append(syllable)
    return expanded
123
+
124
+
125
def change_tone_in_bu_or_yi(chars, pinyin_list):
    """Apply tone sandhi for "一" (yi) and "不" (bu).

    Rules implemented:
      - "一" sandwiched between two identical characters (e.g. 看一看) reads
        as the neutral tone "yi5";
      - "一" before a 4th-tone syllable reads as "yi2";
      - "不" before a 4th-tone syllable reads as "bu2".

    Args:
        chars: the character string.
        pinyin_list: TONE3 pinyin, one entry per character; modified in place
            and also returned.

    Returns:
        The (possibly modified) pinyin list.
    """
    location_yi = [m.start() for m in re.finditer(r'一', chars)]
    location_bu = [m.start() for m in re.finditer(r'不', chars)]
    # BUGFIX: the original bound `l < len(chars)` is always true, so
    # `chars[l + 1]` / `pinyin_list[l + 1]` raised IndexError whenever the
    # character was the last one; the correct bound is `l < len(chars) - 1`.
    for l in location_yi:
        if 0 < l < len(chars) - 1 and chars[l - 1] == chars[l + 1]:
            pinyin_list[l] = 'yi5'
        elif l < len(chars) - 1 and pinyin_list[l + 1][-1] == '4':
            pinyin_list[l] = 'yi2'
    for l in location_bu:
        if l < len(chars) - 1 and pinyin_list[l + 1][-1] == '4':
            pinyin_list[l] = 'bu2'
    return pinyin_list
138
+
139
+
140
def txt2pinyin(text, pinyin_parser):
    """Convert raw Chinese/mixed text to a space-joined phoneme string.

    The text is normalized (numbers, symbols) with NSWNormalizer, segmented
    with jieba, converted to TONE3 pinyin, and run through the tone-sandhi and
    erhua handling. Pause punctuation is appended to the previous token;
    Latin-letter words are kept as upper-case tokens.

    Args:
        text: input text.
        pinyin_parser: a pypinyin ``Pinyin(...).pinyin`` callable.

    Returns:
        str: space-joined phoneme tokens.
    """
    phonemes = []
    text = NSWNormalizer(text.strip()).normalize().upper()
    for segment in text.split(' '):
        for words in jieba.cut(segment):
            if words in _PAUSE_SYMBOL:
                # BUGFIX: guard against punctuation appearing before any
                # phoneme (the original indexed phonemes[-1] on an empty list)
                if phonemes:
                    phonemes[-1] += _PAUSE_SYMBOL[words]
            elif re.search("[\u4e00-\u9fa5]+", words):
                pinyin = pinyin_parser(words, style=Style.TONE3, errors="ignore")
                new_pinyin = []
                for x in pinyin:
                    x = "".join(x)
                    if "#" not in x:
                        new_pinyin.append(x)
                # sandhi for 一/不 only when they are not word-final
                new_pinyin = change_tone_in_bu_or_yi(words, new_pinyin) if len(words) > 1 and words[-1] not in {"一", "不"} else new_pinyin
                # phoneme seq, e.g. ["ce4", "shi4", "wen2", "ben3"]
                phonemes += get_phoneme_from_char_and_pinyin(words, new_pinyin)
            elif re.search(r"[a-zA-Z]", words):
                phonemes.append(words.upper())
    return " ".join(phonemes)
166
+
167
+
168
+
169
def process_batch(text_list, save_dir):
    """Convert a batch of (name, text) pairs into phoneme files under *save_dir*.

    Each item is written to ``<save_dir>/<name>.txt``. Failures are logged and
    skipped so a single bad sample does not abort the whole batch.
    """
    my_pinyin = Pinyin(MyConverter())
    pinyin_parser = my_pinyin.pinyin

    for text_info in tqdm(text_list):
        try:
            name, text = text_info
            save_path = os.path.join(save_dir, name + ".txt")
            phones = txt2pinyin(text, pinyin_parser)
            # BUGFIX: use a context manager so the handle is closed
            # deterministically (the original leaked open file handles)
            with open(save_path, 'w', encoding='utf-8') as fout:
                fout.write(phones)
        except Exception as e:
            print(text_info, e)
181
+
182
def parallel_process(filenames, num_processes, save_dir):
    """Split *filenames* into ``num_processes`` chunks and convert them in parallel."""
    total = len(filenames)
    with ProcessPoolExecutor(max_workers=num_processes) as pool:
        futures = []
        for worker in range(num_processes):
            lo = int(worker * total / num_processes)
            hi = int((worker + 1) * total / num_processes)
            futures.append(pool.submit(process_batch, filenames[lo:hi], save_dir))

        # propagate any worker exception and show progress
        for fut in tqdm(futures):
            fut.result()
193
+
194
+
195
+ if __name__ == "__main__":
196
+ parser = argparse.ArgumentParser()
197
+ parser.add_argument(
198
+ "--text_file", type=str, default="", help="path to input text file")
199
+ parser.add_argument(
200
+ "--save_dir", type=str, default="", help="path to output text file")
201
+ parser.add_argument(
202
+ '--workers', type=int, default=4, help='You are advised to set the number of processes to the same as the number of CPU cores')
203
+ args = parser.parse_args()
204
+
205
+ sampling_rate = 16000
206
+
207
+ os.makedirs(args.save_dir, exist_ok=True)
208
+
209
+ filenames = open(args.text_file, 'r', encoding='utf-8').readlines()
210
+ filenames = [x.strip().split('\t') for x in tqdm(filenames)]
211
+ filenames = [[x[0], x[-1]] for x in tqdm(filenames)]
212
+ # shuffle(filenames)
213
+ print(len(filenames))
214
+ multiprocessing.set_start_method("spawn", force=True)
215
+
216
+ if args.workers == 0:
217
+ args.workers = os.cpu_count()
218
+
219
+ parallel_process(filenames, args.workers, args.save_dir)
220
+
221
+
222
+ #################################################################################
223
+
224
+
225
+
lemas_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A unified script for inference process
2
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from concurrent.futures import ThreadPoolExecutor
7
+
8
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # for MPS device compatibility
9
+ sys.path.append(f"{os.path.dirname(os.path.abspath(__file__))}/../../third_party/BigVGAN/")
10
+
11
+ import hashlib
12
+ import re
13
+ import tempfile
14
+ from importlib.resources import files
15
+
16
+ import matplotlib
17
+
18
+ matplotlib.use("Agg")
19
+
20
+ import matplotlib.pylab as plt
21
+ import numpy as np
22
+ import torch
23
+ import torchaudio
24
+ import tqdm
25
+ from huggingface_hub import hf_hub_download
26
+ from pydub import AudioSegment, silence
27
+ from transformers import pipeline
28
+ from vocos import Vocos
29
+
30
+ from lemas_tts.model.cfm import CFM
31
+ from lemas_tts.model.utils import (
32
+ get_tokenizer,
33
+ convert_char_to_pinyin,
34
+ )
35
+
36
+
37
+ def _find_repo_root(start: Path) -> Path:
38
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
39
+ for p in [start, *start.parents]:
40
+ if (p / "pretrained_models").is_dir():
41
+ return p
42
+ cwd = Path.cwd()
43
+ if (cwd / "pretrained_models").is_dir():
44
+ return cwd
45
+ return start
46
+
47
+
48
# Resolve repository layout for pretrained assets when running from source tree
THIS_FILE = Path(__file__).resolve()
REPO_ROOT = _find_repo_root(THIS_FILE)
PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"  # root of downloaded assets
CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"  # model checkpoints live under here
54
+ _ref_audio_cache = {}
55
+
56
+ device = (
57
+ "cuda"
58
+ if torch.cuda.is_available()
59
+ else "xpu"
60
+ if torch.xpu.is_available()
61
+ else "mps"
62
+ if torch.backends.mps.is_available()
63
+ else "cpu"
64
+ )
65
+
66
+ # -----------------------------------------
67
+
68
+ target_sample_rate = 24000
69
+ n_mel_channels = 100
70
+ hop_length = 256
71
+ win_length = 1024
72
+ n_fft = 1024
73
+ mel_spec_type = "vocos"
74
+ target_rms = 0.1
75
+ cross_fade_duration = 0.15
76
+ ode_method = "euler"
77
+ nfe_step = 32 # 16, 32
78
+ cfg_strength = 3.0
79
+ sway_sampling_coef = 1
80
+ speed = 1.0
81
+ fix_duration = None
82
+
83
+ # -----------------------------------------
84
+
85
+
86
+ # chunk text into smaller pieces
87
+
88
+
89
def chunk_text(text, max_chars=135):
    """
    Split *text* into chunks of at most *max_chars* UTF-8 bytes.

    Sentences are delimited by ASCII/CJK punctuation; a chunk is flushed
    whenever appending the next sentence would exceed the byte budget.

    Args:
        text (str): The text to be split.
        max_chars (int): The maximum number of UTF-8 bytes per chunk.

    Returns:
        List[str]: The whitespace-stripped text chunks.
    """
    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)

    chunks = []
    pending = ""
    for sentence in sentences:
        # A single-byte (ASCII) terminator gets a trailing space so adjacent
        # sentences do not run together.
        sep = " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else ""
        if len(pending.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
            pending += sentence + sep
        else:
            if pending:
                chunks.append(pending.strip())
            pending = sentence + sep

    if pending:
        chunks.append(pending.strip())

    return chunks
117
+
118
+
119
+ # load vocoder
120
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device, hf_cache_dir=None):
    """Load a mel vocoder ("vocos" or "bigvgan"), locally or from the HF hub.

    Args:
        vocoder_name: "vocos" or "bigvgan".
        is_local: load weights from *local_path* instead of downloading.
        local_path: directory holding config.yaml / pytorch_model.bin (vocos)
            or a BigVGAN checkpoint directory.
        device: target device for the vocoder.
        hf_cache_dir: optional HuggingFace cache directory.

    Returns:
        The vocoder module in eval mode on *device*.

    Raises:
        ImportError: for "bigvgan" when the submodule is not set up.
        ValueError: for an unknown *vocoder_name*.
    """
    if vocoder_name == "vocos":
        # vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
        if is_local:
            print(f"Load vocos from local path {local_path}")
            config_path = f"{local_path}/config.yaml"
            model_path = f"{local_path}/pytorch_model.bin"
        else:
            print("Download Vocos from huggingface charactr/vocos-mel-24khz")
            repo_id = "charactr/vocos-mel-24khz"
            config_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="config.yaml")
            model_path = hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename="pytorch_model.bin")
        vocoder = Vocos.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        from vocos.feature_extractors import EncodecFeatures

        if isinstance(vocoder.feature_extractor, EncodecFeatures):
            # Encodec weights are not stored in the checkpoint; copy them from
            # the freshly constructed feature extractor so load_state_dict succeeds.
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in vocoder.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        vocoder.load_state_dict(state_dict)
        vocoder = vocoder.eval().to(device)
    elif vocoder_name == "bigvgan":
        try:
            from third_party.BigVGAN import bigvgan
        except ImportError:
            # BUGFIX: the original only printed here and then hit a NameError
            # on `bigvgan` below; fail fast with the actionable message instead.
            raise ImportError(
                "You need to follow the README to init submodule and change the BigVGAN source code."
            )
        if is_local:
            # download generator from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main
            vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
        else:
            vocoder = bigvgan.BigVGAN.from_pretrained(
                "nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False, cache_dir=hf_cache_dir
            )

        vocoder.remove_weight_norm()
        vocoder = vocoder.eval().to(device)
    else:
        # BUGFIX: the original fell through and raised NameError on `vocoder`.
        raise ValueError(f"Unsupported vocoder_name: {vocoder_name}")
    return vocoder
160
+
161
+
162
+ # load asr pipeline
163
+
164
# Lazily-constructed global Whisper ASR pipeline (see initialize_asr_pipeline).
asr_pipe = None


def initialize_asr_pipeline(device: str = device, dtype=None):
    """Create the global Whisper ASR pipeline used by transcribe()."""
    global asr_pipe
    if dtype is None:
        # fp16 only on CUDA cards with compute capability >= 7 and not under
        # ZLUDA — presumably half precision misbehaves there; confirm.
        use_half = (
            "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 7
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
        )
        dtype = torch.float16 if use_half else torch.float32
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=dtype,
        device=device,
    )
183
+
184
+
185
+ # transcribe
186
+
187
+
188
def transcribe(ref_audio, language=None):
    """Transcribe *ref_audio* with the (lazily created) global Whisper pipeline.

    Args:
        ref_audio: path to the audio to transcribe.
        language: optional language hint passed through to Whisper.

    Returns:
        str: the stripped transcription text.
    """
    global asr_pipe
    if asr_pipe is None:
        initialize_asr_pipeline(device=device)
    gen_kwargs = {"task": "transcribe"}
    if language:
        gen_kwargs["language"] = language
    result = asr_pipe(
        ref_audio,
        chunk_length_s=30,
        batch_size=128,
        generate_kwargs=gen_kwargs,
        return_timestamps=False,
    )
    return result["text"].strip()
199
+
200
+
201
+ # load model checkpoint for inference
202
+
203
+
204
def load_checkpoint(model, ckpt_path, device: str, dtype=None, use_ema=True):
    """Load CFM weights from *ckpt_path* into *model* and move it to *device*.

    Supports both ``.safetensors`` and torch checkpoints, optionally using the
    EMA shadow weights stored under ``ema_model_state_dict``.

    Args:
        model: the CFM model to load into.
        ckpt_path: path to a ``.safetensors`` or torch checkpoint file.
        device: target device string.
        dtype: parameter dtype; defaults to fp16 on capable CUDA GPUs
            (compute capability >= 7, not ZLUDA), else fp32.
        use_ema: load the EMA weights instead of the raw model weights.

    Returns:
        The model with weights loaded, moved to *device*.
    """
    if dtype is None:
        dtype = (
            torch.float16
            if "cuda" in device
            and torch.cuda.get_device_properties(device).major >= 7
            and not torch.cuda.get_device_name().endswith("[ZLUDA]")
            else torch.float32
        )
    model = model.to(dtype)

    ckpt_type = ckpt_path.split(".")[-1]
    if ckpt_type == "safetensors":
        from safetensors.torch import load_file

        checkpoint = load_file(ckpt_path, device=device)
    else:
        checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)

    if use_ema:
        if ckpt_type == "safetensors":
            # safetensors files store the EMA weights flat; wrap them so both
            # formats are handled uniformly below
            checkpoint = {"ema_model_state_dict": checkpoint}
        # strip the "ema_model." prefix and EMA bookkeeping entries
        checkpoint["model_state_dict"] = {
            k.replace("ema_model.", ""): v
            for k, v in checkpoint["ema_model_state_dict"].items()
            if k not in ["initted", "step"]
        }

        # patch for backward compatibility, 305e3ea
        for key in [
            "mel_spec.mel_stft.mel_scale.fb",
            "mel_spec.mel_stft.spectrogram.window",
            "ctc.proj.0.weight",
            "ctc.proj.0.bias",
            "ctc.ctc_proj.weight",
            "ctc.ctc_proj.bias",
        ]:
            if key in checkpoint["model_state_dict"]:
                del checkpoint["model_state_dict"][key]

        # use strict=False so newly added modules (e.g. prosody encoder)
        # that are initialized from their own checkpoints do not cause
        # missing-key errors when loading older CFM checkpoints
        model.load_state_dict(checkpoint["model_state_dict"], strict=False)
    else:
        if ckpt_type == "safetensors":
            checkpoint = {"model_state_dict": checkpoint}
        model.load_state_dict(checkpoint["model_state_dict"], strict=False)

    del checkpoint
    torch.cuda.empty_cache()

    return model.to(device)
257
+
258
+
259
+ # load model for inference
260
+
261
+
262
def load_model(
    model_cls,
    model_cfg,
    ckpt_path,
    mel_spec_type=mel_spec_type,
    vocab_file="",
    ode_method=ode_method,
    use_ema=True,
    device=device,
    use_prosody_encoder=False,
    prosody_cfg_path="",
    prosody_ckpt_path="",
):
    """Construct a CFM model around the given backbone and load its checkpoint.

    Args:
        model_cls: backbone class (e.g. DiT) instantiated with ``**model_cfg``.
        model_cfg: keyword configuration for the backbone.
        ckpt_path: path to the CFM checkpoint.
        mel_spec_type: "vocos" or "bigvgan"; bigvgan forces fp32 weights.
        vocab_file: tokenizer vocab path; defaults to the packaged vocab.txt.
        ode_method: ODE solver name used during sampling.
        use_ema: load EMA weights from the checkpoint.
        device: target device.
        use_prosody_encoder: attach the optional prosody encoder.
        prosody_cfg_path / prosody_ckpt_path: prosody encoder assets; when
            empty they default to files under pretrained_models/ckpts.

    Returns:
        The CFM model on *device*, ready for sampling.
    """
    if vocab_file == "":
        vocab_file = str(files("lemas_tts").joinpath("infer/examples/vocab.txt"))
    tokenizer = "custom"

    print("\nvocab : ", vocab_file)
    print("token : ", tokenizer)
    print("model : ", ckpt_path, "\n")

    vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)

    # Resolve prosody encoder assets if requested but paths not provided
    if use_prosody_encoder:
        if not prosody_cfg_path:
            prosody_cfg_path = str(CKPTS_ROOT / "prosody_encoder" / "pretssel_cfg.json")
        if not prosody_ckpt_path:
            prosody_ckpt_path = str(CKPTS_ROOT / "prosody_encoder" / "prosody_encoder_UnitY2.pt")
    model = CFM(
        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels, use_prosody_encoder=use_prosody_encoder),
        mel_spec_kwargs=dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
        use_prosody_encoder=use_prosody_encoder,
        prosody_cfg_path=prosody_cfg_path,
        prosody_ckpt_path=prosody_ckpt_path,
    ).to(device)

    # bigvgan expects fp32 mels; otherwise let load_checkpoint pick the dtype
    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)

    return model
314
+
315
+
316
def remove_silence_edges(audio, silence_threshold=-42):
    """Trim leading and trailing silence (below *silence_threshold* dBFS)."""
    # Leading edge: pydub reports the first non-silent millisecond directly.
    lead_ms = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio = audio[lead_ms:]

    # Trailing edge: walk backwards one millisecond at a time until sound.
    keep_seconds = audio.duration_seconds
    for frame in reversed(audio):
        if frame.dBFS > silence_threshold:
            break
        keep_seconds -= 0.001
    return audio[: int(keep_seconds * 1000)]
330
+
331
+
332
+ # preprocess reference audio and text
333
+
334
+
335
def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print):
    """Normalize the reference clip and ensure a usable reference transcript.

    The audio is (optionally) clipped to at most ~12 s at silence boundaries,
    edge silence is trimmed, and the result is written to a temp wav file. If
    *ref_text* is empty, the clip is transcribed with Whisper (cached by the
    audio's MD5 hash); the text is then forced to end with sentence-final
    punctuation.

    Args:
        ref_audio_orig: path to the original reference audio.
        ref_text: reference transcription; empty string triggers ASR.
        clip_short: clip the reference down to ~12 s when True.
        show_info: progress callback (e.g. print or a gradio logger).

    Returns:
        (ref_audio, ref_text): path to the processed temp wav and the final
        reference text.
    """
    show_info("Converting audio...")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        aseg = AudioSegment.from_file(ref_audio_orig)

        if clip_short:
            # 1. try to find long silence for clipping
            non_silent_segs = silence.split_on_silence(
                aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
            )
            non_silent_wave = AudioSegment.silent(duration=0)
            for non_silent_seg in non_silent_segs:
                if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
                    show_info("Audio is over 12s, clipping short. (1)")
                    break
                non_silent_wave += non_silent_seg

            # 2. try to find short silence for clipping if 1. failed
            if len(non_silent_wave) > 12000:
                non_silent_segs = silence.split_on_silence(
                    aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
                )
                non_silent_wave = AudioSegment.silent(duration=0)
                for non_silent_seg in non_silent_segs:
                    if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 12000:
                        show_info("Audio is over 12s, clipping short. (2)")
                        break
                    non_silent_wave += non_silent_seg

            aseg = non_silent_wave

            # 3. if no proper silence found for clipping
            if len(aseg) > 12000:
                aseg = aseg[:12000]
                show_info("Audio is over 12s, clipping short. (3)")

        # trim edge silence and leave a short trailing pad
        aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
        aseg.export(f.name, format="wav")
        ref_audio = f.name

    # Compute a hash of the reference audio file
    with open(ref_audio, "rb") as audio_file:
        audio_data = audio_file.read()
        audio_hash = hashlib.md5(audio_data).hexdigest()

    if not ref_text.strip():
        global _ref_audio_cache
        if audio_hash in _ref_audio_cache:
            # Use cached asr transcription
            show_info("Using cached reference text...")
            ref_text = _ref_audio_cache[audio_hash]
        else:
            show_info("No reference text provided, transcribing reference audio...")
            ref_text = transcribe(ref_audio)
            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
            _ref_audio_cache[audio_hash] = ref_text
    else:
        show_info("Using custom reference text...")

    # Ensure ref_text ends with a proper sentence-ending punctuation
    if not ref_text.endswith(". ") and not ref_text.endswith("。"):
        if ref_text.endswith("."):
            ref_text += " "
        else:
            ref_text += ". "

    print("\nref_text ", ref_text)

    return ref_audio, ref_text
404
+
405
+
406
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
407
+
408
+
409
def infer_process(
    ref_audio,
    ref_text,
    gen_text,
    model_obj,
    vocoder,
    mel_spec_type=mel_spec_type,
    show_info=print,
    progress=tqdm,
    target_rms=target_rms,
    cross_fade_duration=cross_fade_duration,
    nfe_step=nfe_step,
    cfg_strength=cfg_strength,
    sway_sampling_coef=sway_sampling_coef,
    use_acc_grl=True,
    use_prosody_encoder=True,
    ref_ratio=None,
    no_ref_audio=False,
    speed=speed,
    fix_duration=fix_duration,
    device=device,
):
    """Chunk *gen_text*, run batched inference, and return the stitched result.

    Thin wrapper over infer_batch_process(): loads the reference audio,
    splits the target text into size-budgeted batches (when *ref_text* is a
    plain string), and returns the first — and, non-streaming, only — yield.

    Returns:
        (final_wave, sample_rate, combined_spectrogram) as produced by
        infer_batch_process().
    """
    # Split the input text into batches
    audio, sr = torchaudio.load(ref_audio)

    if type(ref_text) == str:
        # budget chars per batch from the chars/sec rate the reference implies
        max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (22 - audio.shape[-1] / sr))
        gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
    else:
        # non-str ref_text: caller provides pre-built batches — TODO confirm contract
        gen_text_batches = gen_text

    print(f"ref_text:", ref_text)
    for i, gen_text in enumerate(gen_text_batches):
        print(f"gen_text {i}", gen_text)
    print("\n")

    show_info(f"Generating audio in {len(gen_text_batches)} batches...")
    return next(
        infer_batch_process(
            (audio, sr),
            ref_text,
            gen_text_batches,
            model_obj,
            vocoder,
            mel_spec_type=mel_spec_type,
            progress=progress,
            target_rms=target_rms,
            cross_fade_duration=cross_fade_duration,
            nfe_step=nfe_step,
            cfg_strength=cfg_strength,
            sway_sampling_coef=sway_sampling_coef,
            use_acc_grl=use_acc_grl,
            use_prosody_encoder=use_prosody_encoder,
            ref_ratio=ref_ratio,
            no_ref_audio=no_ref_audio,
            speed=speed,
            fix_duration=fix_duration,
            device=device,
        )
    )
469
+
470
+
471
+ # infer batches
472
+
473
+
474
def infer_batch_process(
    ref_audio,
    ref_text,
    gen_text_batches,
    model_obj,
    vocoder,
    mel_spec_type="vocos",
    progress=tqdm,
    target_rms=0.1,
    cross_fade_duration=0.15,
    nfe_step=32,
    cfg_strength=2.0,
    sway_sampling_coef=-1,
    use_acc_grl=True,
    use_prosody_encoder=True,
    ref_ratio=None,
    no_ref_audio=False,
    speed=1,
    fix_duration=None,
    device=None,
    streaming=False,
    chunk_size=2048,
):
    """Run CFM sampling for every batch in *gen_text_batches*.

    The reference audio is mono-mixed, RMS-boosted up to *target_rms*, and
    resampled to target_sample_rate. Each text batch is synthesized via
    ``model_obj.sample`` and vocoded; non-streaming results are cross-faded
    into one waveform.

    Yields:
        streaming=True: (wave_chunk, target_sample_rate) per *chunk_size* slice.
        streaming=False: a single (final_wave, target_sample_rate,
        combined_spectrogram) tuple, or (None, target_sample_rate, None)
        when nothing was generated.
    """
    audio, sr = ref_audio
    if audio.shape[0] > 1:
        # mix multi-channel reference down to mono
        audio = torch.mean(audio, dim=0, keepdim=True)

    rms = torch.sqrt(torch.mean(torch.square(audio)))
    if rms < target_rms:
        # boost quiet references; the inverse scaling is re-applied below
        audio = audio * target_rms / rms
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
        audio = resampler(audio)
    audio = audio.to(device)

    generated_waves = []
    spectrograms = []

    if type(ref_text) == str:
        # single-byte (ASCII) ending: add a space so ref and gen text don't fuse
        if len(ref_text[-1].encode("utf-8")) == 1:
            ref_text = ref_text + " "

    def process_batch(gen_text):
        # Generator: synthesize one text batch; yields wave chunks when
        # streaming, else one (wave, mel) pair.
        local_speed = speed

        if type(ref_text) == str:
            # very short prompts sound rushed; slow them down
            if len(gen_text.encode("utf-8")) < 10:
                local_speed = 0.3

            # Prepare the text
            text_list = [ref_text + gen_text]
            final_text_list = convert_char_to_pinyin(text_list)
        else:
            # non-str ref_text is assumed pre-tokenized — TODO confirm caller contract
            final_text_list = [ref_text + gen_text]
        print("final_text_list:", final_text_list)

        ref_audio_len = audio.shape[-1] // hop_length
        if fix_duration is not None:
            duration = int(fix_duration * target_sample_rate / hop_length)
        else:
            # Calculate duration proportionally to the reference chars-per-frame rate
            ref_text_len = len(ref_text)  # .encode("utf-8")
            gen_text_len = len(gen_text)  # .encode("utf-8")
            duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / local_speed)

        # inference
        with torch.inference_mode():
            generated, _ = model_obj.sample(
                cond=audio,
                text=final_text_list,
                duration=duration,
                steps=nfe_step,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                use_acc_grl=use_acc_grl,
                use_prosody_encoder=use_prosody_encoder,
                ref_ratio=ref_ratio,
                no_ref_audio=no_ref_audio,
            )
            del _

            generated = generated.to(torch.float32)  # generated mel spectrogram
            generated = generated[:, ref_audio_len:, :]  # drop the reference prefix
            generated = generated.permute(0, 2, 1)
            if mel_spec_type == "vocos":
                generated_wave = vocoder.decode(generated)
            elif mel_spec_type == "bigvgan":
                generated_wave = vocoder(generated)
            if rms < target_rms:
                # undo the earlier loudness boost
                generated_wave = generated_wave * rms / target_rms

            # wav -> numpy
            # generated_wave = torch.clip(generated_wave, -0.999, 0.999)
            generated_wave = generated_wave.squeeze().cpu().numpy()

            if streaming:
                for j in range(0, len(generated_wave), chunk_size):
                    yield generated_wave[j : j + chunk_size], target_sample_rate
            else:
                generated_cpu = generated[0].cpu().numpy()
                del generated
                yield generated_wave, generated_cpu

    if streaming:
        for gen_text in progress.tqdm(gen_text_batches) if progress is not None else gen_text_batches:
            for chunk in process_batch(gen_text):
                yield chunk
    else:
        # NOTE(review): process_batch is a generator function, so submit() only
        # creates the generator; the actual sampling runs at next(result) on
        # this thread — the pool likely adds no parallelism. Confirm intent.
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_batch, gen_text) for gen_text in gen_text_batches]
            for future in progress.tqdm(futures) if progress is not None else futures:
                result = future.result()
                if result:
                    generated_wave, generated_mel_spec = next(result)
                    generated_waves.append(generated_wave)
                    spectrograms.append(generated_mel_spec)

        if generated_waves:
            if cross_fade_duration <= 0:
                # Simply concatenate
                final_wave = np.concatenate(generated_waves)
            else:
                # Combine all generated waves with cross-fading
                final_wave = generated_waves[0]
                for i in range(1, len(generated_waves)):
                    prev_wave = final_wave
                    next_wave = generated_waves[i]

                    # Calculate cross-fade samples, ensuring it does not exceed wave lengths
                    cross_fade_samples = int(cross_fade_duration * target_sample_rate)
                    cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))

                    if cross_fade_samples <= 0:
                        # No overlap possible, concatenate
                        final_wave = np.concatenate([prev_wave, next_wave])
                        continue

                    # Overlapping parts
                    prev_overlap = prev_wave[-cross_fade_samples:]
                    next_overlap = next_wave[:cross_fade_samples]

                    # Fade out and fade in
                    fade_out = np.linspace(1, 0, cross_fade_samples)
                    fade_in = np.linspace(0, 1, cross_fade_samples)

                    # Cross-faded overlap
                    cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in

                    # Combine
                    new_wave = np.concatenate(
                        [prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
                    )

                    final_wave = new_wave

            # Create a combined spectrogram
            combined_spectrogram = np.concatenate(spectrograms, axis=1)
            final_wave = np.clip(final_wave, -0.999, 0.999)
            yield final_wave, target_sample_rate, combined_spectrogram

        else:
            yield None, target_sample_rate, None
636
+
637
+
638
+ # remove silence from generated wav
639
+
640
+
641
def remove_silence_for_generated_wav(filename):
    """Strip long (>=1 s) silences from the wav at *filename*, rewriting it in place."""
    segment = AudioSegment.from_file(filename)
    voiced_parts = silence.split_on_silence(
        segment, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
    )
    rebuilt = AudioSegment.silent(duration=0)
    for part in voiced_parts:
        rebuilt = rebuilt + part
    rebuilt.export(filename, format="wav")
651
+
652
+
653
+ # save spectrogram
654
+
655
+
656
def save_spectrogram(spectrogram, path):
    """Render *spectrogram* (2-D array, mel bins x frames) as an image at *path*."""
    plt.figure(figsize=(12, 4))
    plt.imshow(spectrogram, origin="lower", aspect="auto")
    plt.colorbar()
    plt.savefig(path)
    plt.close()  # free the figure (Agg backend, no display)
lemas_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Backbones quick introduction
2
+
3
+
4
+ ### unett.py
5
+ - flat unet transformer
6
+ - structure same as in e2-tts & voicebox paper except using rotary pos emb
7
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
8
+
9
+ ### dit.py
10
+ - adaln-zero dit
11
+ - embedded timestep as condition
12
+ - concatted noised_input + masked_cond + embedded_text, linear proj in
13
+ - possible abs pos emb & convnextv2 blocks for embedded text before concat
14
+ - possible long skip connection (first layer to last layer)
15
+
16
+ ### mmdit.py
17
+ - stable diffusion 3 block structure
18
+ - timestep as condition
19
+ - left stream: text embedded and applied an abs pos emb
20
+ - right stream: masked_cond & noised_input concatted and with same conv pos emb as unett
lemas_tts/model/backbones/dit.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Optional
13
+
14
+ import torch
15
+ from torch import nn
16
+ import torch.nn.functional as F
17
+
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from lemas_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ DiTBlock,
25
+ AdaLayerNorm_Final,
26
+ precompute_freqs_cis,
27
+ get_pos_embed_indices,
28
+ )
29
+ from lemas_tts.model.backbones.ecapa_tdnn import ECAPA_TDNN
30
+
31
+ # Text embedding
32
+
33
+
34
class TextEmbedding(nn.Module):
    """Embed text token ids and align them to the mel-frame axis.

    Index 0 of the embedding table is reserved as a filler token, so raw ids
    are shifted by +1 on entry (batch padding of -1 therefore maps to 0).
    With ``conv_layers > 0`` the embedded text additionally receives a
    precomputed sinusoidal position embedding and is refined by a stack of
    ConvNeXtV2 blocks before being consumed by the transformer trunk.
    """

    def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)  # use 0 as filler token

        self.mask_padding = mask_padding  # mask filler and batch padding tokens or not

        if conv_layers > 0:
            self.extra_modeling = True
            self.precompute_max_pos = 4096  # ~44s of 24khz audio
            self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
            self.text_blocks = nn.Sequential(
                *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
            )
        else:
            self.extra_modeling = False

    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
        """Return per-frame text embeddings of shape (batch, seq_len, text_dim).

        ``text`` is truncated or zero-padded to ``seq_len`` (the mel length).
        ``drop_text=True`` zeroes all ids for classifier-free guidance.
        """
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        text = text[:, :seq_len]  # curtail if character tokens are more than the mel spec tokens
        batch, text_len = text.shape[0], text.shape[1]
        text = F.pad(text, (0, seq_len - text_len), value=0)
        if self.mask_padding:
            # computed before the optional drop, so CFG still masks padding
            text_mask = text == 0

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b n -> b n d

        # possible extra modeling
        if self.extra_modeling:
            # sinus pos emb
            # NOTE(review): batch_start is created on CPU; presumably
            # get_pos_embed_indices / indexing handle device placement — confirm.
            batch_start = torch.zeros((batch,), dtype=torch.long)
            pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
            text_pos_embed = self.freqs_cis[pos_idx]
            text = text + text_pos_embed

            # convnextv2 blocks
            if self.mask_padding:
                # re-zero padded positions around every conv block so that
                # padding never leaks into neighbours via the convolutions
                text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
                for block in self.text_blocks:
                    text = block(text)
                    text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
            else:
                text = self.text_blocks(text)

        return text
82
+
83
+
84
+ # noised input audio and context mixing embedding
85
+
86
+
87
class InputEmbedding(nn.Module):
    """Fuse noised audio, masked conditioning audio and text embeddings.

    The three streams are concatenated on the feature axis, linearly
    projected to the model width, then enriched with a convolutional
    position embedding added residually.
    """

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        """Return the fused sequence of shape (b, n, out_dim)."""
        if drop_audio_cond:  # classifier-free guidance on the audio condition
            cond = torch.zeros_like(cond)

        fused = self.proj(torch.cat((x, cond, text_embed), dim=-1))
        return fused + self.conv_pos_embed(fused)
100
+
101
+
102
+ # Transformer backbone using DiT blocks
103
+
104
+
105
class DiT(nn.Module):
    """AdaLN-zero DiT backbone for flow-matching TTS.

    Consumes noised mel frames, masked conditioning mel frames and embedded
    text; the timestep is embedded and injected through the adaptive layer
    norms of every block. Optionally adds a projected 512-dim prosody
    embedding to the text stream and supports a long skip connection from
    the first to the last layer.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_dim=None,
        text_mask_padding=True,
        qk_norm=None,
        conv_layers=0,
        pe_attn_head=None,
        long_skip_connection=False,
        checkpoint_activations=False,
        use_prosody_encoder=False,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        if text_dim is None:
            text_dim = mel_dim
        self.text_embed = TextEmbedding(
            text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
        )
        # project prosody embeddings (512-dim) to text_dim for conditioning
        self.use_prosody_encoder = use_prosody_encoder
        if use_prosody_encoder:
            self.prosody_text_proj = nn.Linear(512, text_dim)
        else:
            self.prosody_text_proj = None
        self.text_cond, self.text_uncond = None, None  # text cache
        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [
                DiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    ff_mult=ff_mult,
                    dropout=dropout,
                    qk_norm=qk_norm,
                    pe_attn_head=pe_attn_head,
                )
                for _ in range(depth)
            ]
        )
        self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

        self.norm_out = AdaLayerNorm_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.checkpoint_activations = checkpoint_activations

        self.initialize_weights()

    def initialize_weights(self):
        """Zero-init all modulation and output layers (AdaLN-zero scheme),
        so each block initially acts close to identity."""
        # Zero-out AdaLN layers in DiT blocks:
        for block in self.transformer_blocks:
            nn.init.constant_(block.attn_norm.linear.weight, 0)
            nn.init.constant_(block.attn_norm.linear.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.norm_out.linear.weight, 0)
        nn.init.constant_(self.norm_out.linear.bias, 0)
        nn.init.constant_(self.proj_out.weight, 0)
        nn.init.constant_(self.proj_out.bias, 0)

    def ckpt_wrapper(self, module):
        """Wrap a module call for torch.utils.checkpoint (positional args only)."""
        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
        def ckpt_forward(*inputs):
            outputs = module(*inputs)
            return outputs

        return ckpt_forward

    def clear_cache(self):
        """Drop the cached conditional/unconditional text embeddings."""
        self.text_cond, self.text_uncond = None, None

    def forward(
        self,
        x: float["b n d"],  # nosied input audio # noqa: F722
        cond: float["b n d"],  # masked cond audio # noqa: F722
        text: int["b nt"],  # text # noqa: F722
        time: float["b"] | float[""],  # time step # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
        cache=False,
        prosody_text: Optional[torch.Tensor] = None,
    ):
        """Predict the output for one step; returns a (b, n, mel_dim) tensor.

        With ``cache=True`` the text embedding for the current cond/uncond
        branch is computed once and reused across calls (use ``clear_cache()``
        when the text changes).
        """
        batch, seq_len = x.shape[0], x.shape[1]
        if time.ndim == 0:
            # broadcast a scalar timestep over the batch
            time = time.repeat(batch)

        # t: conditioning time, text: text, x: noised audio + cond audio + text
        t = self.time_embed(time)
        if cache:
            if drop_text:
                if self.text_uncond is None:
                    self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
                text_embed = self.text_uncond
            else:
                if self.text_cond is None:
                    self.text_cond = self.text_embed(text, seq_len, drop_text=False)
                text_embed = self.text_cond
        else:
            text_embed = self.text_embed(text, seq_len, drop_text=drop_text)

        # optional prosody conditioning on text side
        if prosody_text is not None and self.use_prosody_encoder:
            # prosody_text: (B, T_text, 512) -> project to text_dim and align to seq_len
            pt = self.prosody_text_proj(prosody_text)
            if pt.size(1) < seq_len:
                pad_len = seq_len - pt.size(1)
                pt = F.pad(pt, (0, 0, 0, pad_len))
            elif pt.size(1) > seq_len:
                pt = pt[:, :seq_len]
            text_embed = text_embed + pt
        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)

        rope = self.rotary_embed.forward_from_seq_len(seq_len)

        if self.long_skip_connection is not None:
            residual = x

        for block in self.transformer_blocks:
            if self.checkpoint_activations:
                # https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
                x = torch.utils.checkpoint.checkpoint(self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False)
            else:
                x = block(x, t, mask=mask, rope=rope)

        if self.long_skip_connection is not None:
            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
lemas_tts/model/backbones/ecapa_tdnn.py ADDED
@@ -0,0 +1,931 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A popular speaker recognition and diarization model.
2
+
3
+ Authors
4
+ * Hwidong Na 2020
5
+ """
6
+
7
+ import math
8
+ import os
9
+ import torch # noqa: F401
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
def length_to_mask(length, max_len=None, dtype=None, device=None):
    """Create a binary padding mask from per-sequence lengths.

    Reference: https://discuss.pytorch.org/t/how-to-generate-variable-length-mask/23397/3

    Arguments
    ---------
    length : torch.LongTensor
        1D tensor holding the length of each sequence in the batch.
    max_len : int
        Size of the mask's second dimension; defaults to ``length.max()``.
    dtype : torch.dtype, default: None
        Output dtype; defaults to ``length.dtype``.
    device: torch.device, default: None
        Output device; defaults to ``length.device``.

    Returns
    -------
    mask : tensor
        The binary mask, shape (len(length), max_len).

    Example
    -------
    >>> length=torch.Tensor([1,2,3])
    >>> mask=length_to_mask(length)
    >>> mask
    tensor([[1., 0., 0.],
            [1., 1., 0.],
            [1., 1., 1.]])
    """
    assert len(length.shape) == 1

    if max_len is None:
        max_len = length.max().long().item()

    # positions [0..max_len) compared against each length gives the mask
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.expand(len(length), max_len) < length.unsqueeze(1)

    dtype = length.dtype if dtype is None else dtype
    device = length.device if device is None else device

    return torch.as_tensor(mask, dtype=dtype, device=device)
60
+
61
+
62
def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
    """Compute the [left, right] zero-padding for a 1d convolution.

    For ``stride == 1`` the padding is chosen so the convolution output has
    the same length as the input; for ``stride > 1`` a symmetric half-kernel
    padding is used.

    Arguments
    ---------
    L_in : int
        Input length on the time axis.
    stride : int
        Stride of the convolution.
    kernel_size : int
        Kernel size of the convolution.
    dilation : int
        Dilation of the convolution.

    Returns
    -------
    list[int]
        Two-element [left, right] padding.
    """
    if stride > 1:
        # Fix: the original also derived an (unused) output length here;
        # the padding only depends on the kernel size, so the dead
        # computation is removed.
        half_kernel = kernel_size // 2
        return [half_kernel, half_kernel]

    # "same"-style padding: pad by the amount the convolution would shrink.
    L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
    pad = (L_in - L_out) // 2
    return [pad, pad]
82
+
83
+
84
class Conv1d(nn.Module):
    """This function implements 1d convolution.

    Arguments
    ---------
    out_channels : int
        It is the number of output channels.
    kernel_size : int
        Kernel size of the convolutional filters.
    input_shape : tuple
        The shape of the input. Alternatively use ``in_channels``.
    in_channels : int
        The number of input channels. Alternatively use ``input_shape``.
    stride : int
        Stride factor of the convolutional filters. When the stride factor > 1,
        a decimation in time is performed.
    dilation : int
        Dilation factor of the convolutional filters.
    padding : str
        (same, valid, causal). If "valid", no padding is performed.
        If "same" and stride is 1, output shape is the same as the input shape.
        "causal" results in causal (dilated) convolutions.
    padding_mode : str
        This flag specifies the type of padding. See torch.nn documentation
        for more information.
    skip_transpose : bool
        If False, uses batch x time x channel convention of speechbrain.
        If True, uses batch x channel x time convention.

    Example
    -------
    >>> inp_tensor = torch.rand([10, 40, 16])
    >>> cnn_1d = Conv1d(
    ...     input_shape=inp_tensor.shape, out_channels=8, kernel_size=5
    ... )
    >>> out_tensor = cnn_1d(inp_tensor)
    >>> out_tensor.shape
    torch.Size([10, 40, 8])
    """

    def __init__(
        self,
        out_channels,
        kernel_size,
        input_shape=None,
        in_channels=None,
        stride=1,
        dilation=1,
        padding="same",
        groups=1,
        bias=True,
        padding_mode="reflect",
        skip_transpose=True,
    ):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.padding_mode = padding_mode
        # set True by _check_input_shape for 2d input (treated as 1 channel)
        self.unsqueeze = False
        self.skip_transpose = skip_transpose

        if input_shape is None and in_channels is None:
            raise ValueError("Must provide one of input_shape or in_channels")

        if in_channels is None:
            in_channels = self._check_input_shape(input_shape)

        # padding=0 in the underlying conv: padding is applied manually in
        # forward so "same"/"causal" modes and padding_mode are supported
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            self.kernel_size,
            stride=self.stride,
            dilation=self.dilation,
            padding=0,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        """Returns the output of the convolution.

        Arguments
        ---------
        x : torch.Tensor (batch, time, channel)
            input to convolve. 2d or 4d tensors are expected.
        """

        if not self.skip_transpose:
            x = x.transpose(1, -1)

        if self.unsqueeze:
            x = x.unsqueeze(1)

        if self.padding == "same":
            x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride)

        elif self.padding == "causal":
            # pad only on the left so the output never sees future frames
            num_pad = (self.kernel_size - 1) * self.dilation
            x = F.pad(x, (num_pad, 0))

        elif self.padding == "valid":
            pass

        else:
            raise ValueError(
                "Padding must be 'same', 'valid' or 'causal'. Got " + self.padding
            )

        # cast input to the conv's parameter dtype (mixed-precision safety)
        wx = self.conv(x.to(self.conv.weight.dtype))

        if self.unsqueeze:
            wx = wx.squeeze(1)

        if not self.skip_transpose:
            wx = wx.transpose(1, -1)

        return wx

    def _manage_padding(
        self,
        x,
        kernel_size: int,
        dilation: int,
        stride: int,
    ):
        """This function performs zero-padding on the time axis
        such that their lengths is unchanged after the convolution.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor.
        kernel_size : int
            Size of kernel.
        dilation : int
            Dilation used.
        stride : int
            Stride.
        """

        # Detecting input shape
        L_in = x.shape[-1]

        # Time padding
        padding = get_padding_elem(L_in, stride, kernel_size, dilation)

        # Applying padding (self.padding_mode, e.g. "reflect")
        x = F.pad(x, padding, mode=self.padding_mode)

        return x

    def _check_input_shape(self, shape):
        """Checks the input shape and returns the number of input channels."""

        if len(shape) == 2:
            self.unsqueeze = True
            in_channels = 1
        elif self.skip_transpose:
            in_channels = shape[1]
        elif len(shape) == 3:
            in_channels = shape[2]
        else:
            raise ValueError("conv1d expects 2d, 3d inputs. Got " + str(len(shape)))

        # Kernel size must be odd (so "same" padding can be symmetric)
        if self.kernel_size % 2 == 0:
            raise ValueError(
                "The field kernel size must be an odd number. Got %s."
                % (self.kernel_size)
            )
        return in_channels
257
+
258
+
259
class Fp32BatchNorm(nn.Module):
    """Batch norm that always keeps its statistics in float32.

    Wraps ``nn.SyncBatchNorm`` (when a distributed group with world size > 1
    is initialized and ``sync=True``) or ``nn.BatchNorm1d``. The input is
    cast to float32 for normalization and the output cast back to the input
    dtype, keeping batch norm numerically stable under mixed precision.
    """

    def __init__(self, sync=True, *args, **kwargs):
        super().__init__()

        # Syncing across processes only makes sense in an initialized
        # multi-process group; also guard builds without distributed support.
        if (
            not torch.distributed.is_available()
            or not torch.distributed.is_initialized()
            or torch.distributed.get_world_size() == 1
        ):
            sync = False

        if sync:
            self.bn = nn.SyncBatchNorm(*args, **kwargs)
        else:
            self.bn = nn.BatchNorm1d(*args, **kwargs)

        self.sync = sync

    def forward(self, input):
        # Lazily promote the wrapped module's state to float32 the first
        # time it is observed in lower precision (e.g. after .half()).
        if self.bn.running_mean.dtype != torch.float:
            if self.sync:
                self.bn.running_mean = self.bn.running_mean.float()
                self.bn.running_var = self.bn.running_var.float()
                if self.bn.affine:
                    try:
                        self.bn.weight = self.bn.weight.float()
                        self.bn.bias = self.bn.bias.float()
                    except TypeError:
                        # Fix: was a bare ``except:``. nn.Module refuses
                        # assigning a plain Tensor over a Parameter
                        # (TypeError); fall back to casting the module.
                        self.bn.float()
            else:
                self.bn.float()

        output = self.bn(input.float())
        return output.type_as(input)
292
+
293
+
294
class BatchNorm1d(nn.Module):
    """Applies 1d batch normalization to the input tensor.

    Wraps ``Fp32BatchNorm`` (statistics kept in float32); with
    ``enabled=False`` it becomes a pass-through identity.

    Arguments
    ---------
    input_shape : tuple
        The expected shape of the input. Alternatively, use ``input_size``.
    input_size : int
        The expected size of the input. Alternatively, use ``input_shape``.
    eps : float
        This value is added to std deviation estimation to improve the numerical
        stability.
    momentum : float
        It is a value used for the running_mean and running_var computation.
    affine : bool
        When set to True, the affine parameters are learned.
    track_running_stats : bool
        When set to True, this module tracks the running mean and variance,
        and when set to False, this module does not track such statistics.
    combine_batch_time : bool
        When true, it combines batch an time axis.
    skip_transpose : bool
        When True, the input is taken as channel-first; when False it is
        transposed to channel-first around the norm.
    enabled : bool
        When False, no normalization is applied (identity).

    Example
    -------
    >>> input = torch.randn(100, 10)
    >>> norm = BatchNorm1d(input_shape=input.shape)
    >>> output = norm(input)
    >>> output.shape
    torch.Size([100, 10])
    """

    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        combine_batch_time=False,
        skip_transpose=True,
        enabled=True,
    ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        # channel dim is index 1 for channel-first input, last dim otherwise
        if input_size is None and skip_transpose:
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        if enabled:
            self.norm = Fp32BatchNorm(
                num_features=input_size,
                eps=eps,
                momentum=momentum,
                affine=affine,
                track_running_stats=track_running_stats,
            )
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Returns the normalized input tensor.

        Arguments
        ---------
        x : torch.Tensor (batch, time, [channels])
            input to normalize. 2d or 3d tensors are expected in input
            4d tensors can be used when combine_dims=True.
        """
        shape_or = x.shape
        if self.combine_batch_time:
            # fold time into the batch axis so norm runs per channel
            if x.ndim == 3:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
            else:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[3], shape_or[2])

        elif not self.skip_transpose:
            # channel-last input: move channels to dim 1 for the norm
            x = x.transpose(-1, 1)

        x_n = self.norm(x)

        # undo the reshape/transpose so the output shape matches the input
        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose(1, -1)

        return x_n
385
+
386
+
387
class Linear(torch.nn.Module):
    """Computes a linear transformation y = wx + b.

    Arguments
    ---------
    n_neurons : int
        It is the number of output neurons (i.e, the dimensionality of the
        output).
    bias : bool
        If True, the additive bias b is adopted.
    combine_dims : bool
        If True and the input is 4D, combine 3rd and 4th dimensions of input.

    Example
    -------
    >>> inputs = torch.rand(10, 50, 40)
    >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
    >>> output = lin_t(inputs)
    >>> output.shape
    torch.Size([10, 50, 100])
    """

    def __init__(
        self,
        n_neurons,
        input_shape=None,
        input_size=None,
        bias=True,
        combine_dims=False,
    ):
        super().__init__()
        self.combine_dims = combine_dims

        if input_shape is None and input_size is None:
            raise ValueError("Expected one of input_shape or input_size")

        if input_size is None:
            # infer the feature size from the given shape
            input_size = input_shape[-1]
            if len(input_shape) == 4 and self.combine_dims:
                input_size = input_shape[2] * input_shape[3]

        # Weights are initialized following pytorch approach
        self.w = nn.Linear(input_size, n_neurons, bias=bias)

    def forward(self, x):
        """Returns the linear transformation of input tensor.

        Arguments
        ---------
        x : torch.Tensor
            Input to transform linearly.
        """
        if x.ndim == 4 and self.combine_dims:
            b, t, d1, d2 = x.shape
            x = x.reshape(b, t, d1 * d2)

        return self.w(x)
445
+
446
+
447
class TDNNBlock(nn.Module):
    """A single TDNN layer: Conv1d -> activation -> (optional) batch norm.

    Arguments
    ---------
    in_channels : int
        Number of input channels.
    out_channels : int
        The number of output channels.
    kernel_size : int
        The kernel size of the convolution.
    dilation : int
        The dilation of the convolution.
    activation : torch class
        A class for constructing the activation layers.
    batch_norm : bool
        Whether batch normalization is applied after the activation.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = TDNNBlock(64, 64, kernel_size=3, dilation=1)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        batch_norm=True,
    ):
        super().__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
        )
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels, enabled=batch_norm)

    def forward(self, x):
        """Apply conv -> activation -> norm to a (N, C, L) tensor."""
        hidden = self.conv(x)
        hidden = self.activation(hidden)
        return self.norm(hidden)
493
+
494
+
495
class Res2NetBlock(torch.nn.Module):
    """Res2Net block with dilation.

    Splits the channels into ``scale`` groups; the first group passes
    through untouched, every later group goes through a TDNN block that
    also receives the previous group's output, building a multi-scale
    receptive field.

    Arguments
    ---------
    in_channels : int
        The number of channels expected in the input.
    out_channels : int
        The number of output channels.
    scale : int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the Res2Net block.
    dilation : int
        The dilation of the Res2Net block.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        scale=8,
        kernel_size=3,
        dilation=1,
        batch_norm=True,
    ):
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        width_in = in_channels // scale
        width_out = out_channels // scale

        self.blocks = nn.ModuleList(
            [
                TDNNBlock(
                    width_in,
                    width_out,
                    kernel_size=kernel_size,
                    dilation=dilation,
                    batch_norm=batch_norm,
                )
                for _ in range(scale - 1)
            ]
        )
        self.scale = scale

    def forward(self, x):
        """Process a (N, C, L) tensor; the first chunk is passed through."""
        chunks = torch.chunk(x, self.scale, dim=1)
        outputs = [chunks[0]]
        prev = None
        for idx in range(1, self.scale):
            # from the 3rd chunk on, mix in the previous block's output
            inp = chunks[idx] if prev is None else chunks[idx] + prev
            prev = self.blocks[idx - 1](inp)
            outputs.append(prev)
        return torch.cat(outputs, dim=1)
562
+
563
+
564
class SEBlock(nn.Module):
    """Squeeze-and-excitation block.

    Rescales each channel of the input by a sigmoid gate computed from the
    (optionally length-masked) temporal mean.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    se_channels : int
        The number of output channels after squeeze.
    out_channels : int
        The number of output channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> se_layer = SEBlock(64, 16, 64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(self, in_channels, se_channels, out_channels):
        super().__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1
        )
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1
        )
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        """Gate a (N, C, L) tensor channel-wise."""
        T = x.shape[-1]
        if lengths is None:
            pooled = x.mean(dim=2, keepdim=True)
        else:
            # lengths are relative (see the class example); scale to frames
            # and average only over the unpadded positions
            mask = length_to_mask(lengths * T, max_len=T, device=x.device)
            mask = mask.unsqueeze(1)
            frames = mask.sum(dim=2, keepdim=True)
            pooled = (x * mask).sum(dim=2, keepdim=True) / frames

        gate = self.sigmoid(self.conv2(self.relu(self.conv1(pooled))))
        return gate * x
612
+
613
+
614
class AttentiveStatisticsPooling(nn.Module):
    """This class implements an attentive statistic pooling layer for each channel.
    It returns the concatenated mean and std of the input tensor.

    Arguments
    ---------
    channels: int
        The number of input channels.
    attention_channels: int
        The number of attention channels.
    global_context : bool
        If True, the attention network also sees the utterance-level mean and
        std concatenated to every frame.
    batch_norm : bool
        Whether the internal TDNN block uses batch normalization.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> asp_layer = AttentiveStatisticsPooling(64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 1, 128])
    """

    def __init__(
        self, channels, attention_channels=128, global_context=True, batch_norm=True
    ):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        if global_context:
            self.tdnn = TDNNBlock(
                channels * 3, attention_channels, 1, 1, batch_norm=batch_norm
            )
        else:
            # Fix: ``batch_norm`` was previously passed positionally into
            # TDNNBlock's ``activation`` slot (and again as batch_norm),
            # making the block instantiate ``bool()`` as its activation and
            # crash in forward. Pass it by keyword instead.
            self.tdnn = TDNNBlock(
                channels, attention_channels, 1, 1, batch_norm=batch_norm
            )
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels, out_channels=channels, kernel_size=1
        )

    def forward(self, x, lengths=None):
        """Calculates attention-weighted mean and std for a batch.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape [N, C, L].
        lengths : torch.Tensor, optional
            Relative lengths (see the class example); defaults to
            full-length sequences.

        Returns
        -------
        torch.Tensor of shape [N, 2 * C, 1].
        """
        L = x.shape[-1]

        def _compute_statistics(x, m, dim=2, eps=self.eps):
            # weighted mean/std under weights m; clamp keeps sqrt stable
            mean = (m * x).sum(dim)
            std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
            return mean, std

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Make binary mask of shape [N, 1, L]
        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
        mask = mask.unsqueeze(1)

        # Expand the temporal context of the pooling layer by allowing the
        # self-attention to look at global properties of the utterance.
        if self.global_context:
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            total = mask.sum(dim=2, keepdim=True).float()
            mean, std = _compute_statistics(x, mask / total)
            mean = mean.unsqueeze(2).repeat(1, 1, L)
            std = std.unsqueeze(2).repeat(1, 1, L)
            attn = torch.cat([x, mean, std], dim=1)
        else:
            attn = x

        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Filter out zero-paddings so they get zero attention weight
        attn = attn.masked_fill(mask == 0, float("-inf"))

        attn = F.softmax(attn, dim=2)
        mean, std = _compute_statistics(x, attn)
        # Append mean and std of the batch
        pooled_stats = torch.cat((mean, std), dim=1)
        pooled_stats = pooled_stats.unsqueeze(2)

        return pooled_stats
703
+
704
+
705
class SERes2NetBlock(nn.Module):
    """ECAPA-TDNN building block: TDNN -> Res2Net -> TDNN -> SE with residual.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    out_channels: int
        The number of output channels.
    res2net_scale: int
        The scale of the Res2Net block.
    se_channels : int
        The number of squeeze channels in the SE block.
    kernel_size: int
        The kernel size of the TDNN blocks.
    dilation: int
        The dilation of the Res2Net block.
    activation : torch class
        A class for constructing the activation layers.

    Example
    -------
    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
    >>> out = conv(x).transpose(1, 2)
    >>> out.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        batch_norm=True,
    ):
        super().__init__()
        self.out_channels = out_channels

        self.tdnn1 = TDNNBlock(
            in_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            batch_norm=batch_norm,
        )
        self.res2net_block = Res2NetBlock(
            out_channels,
            out_channels,
            res2net_scale,
            kernel_size,
            dilation,
            batch_norm=batch_norm,
        )
        self.tdnn2 = TDNNBlock(
            out_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            activation=activation,
            batch_norm=batch_norm,
        )
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        # 1x1 conv on the residual path when channel counts differ
        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        """Apply the block to a (N, C, L) tensor with a residual connection."""
        identity = x if self.shortcut is None else self.shortcut(x)

        out = self.tdnn1(x)
        out = self.res2net_block(out)
        out = self.tdnn2(out)
        out = self.se_block(out, lengths)

        return out + identity
789
+
790
+
791
class ECAPA_TDNN(torch.nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

    Arguments
    ---------
    input_size : int
        Feature dimension of the input (and, in this variant, of the output:
        the final conv maps back to ``input_size`` rather than ``lin_neurons``).
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        Output channels for TDNN/SERes2Net layer.
    kernel_sizes : list of ints
        List of kernel sizes for each layer.
    dilations : list of ints
        List of dilations for kernels in each layer.
    lin_neurons : int
        Number of neurons in linear layers.
        NOTE(review): accepted for API compatibility but unused here — the
        final projection outputs ``input_size`` channels (see ``self.fc``).

    Example
    -------
    >>> input_feats = torch.rand([5, 120, 80])
    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([5, 80])
    """

    def __init__(
        self,
        input_size,
        lin_neurons=192,
        activation=torch.nn.ReLU,
        # NOTE(review): mutable default lists are shared across calls; harmless
        # here because they are only read, never mutated.
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        batch_norm=True,
    ):

        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        self.channels = channels
        self.blocks = nn.ModuleList()

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                batch_norm=batch_norm,
            )
        )

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    batch_norm=batch_norm,
                )
            )

        # Multi-layer feature aggregation: fuses the concatenated outputs of
        # all SE-Res2Net layers back to channels[-1].
        self.mfa = TDNNBlock(
            channels[-1],
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            batch_norm=batch_norm,
        )

        # Attentive Statistical Pooling (mean+std -> 2x channels)
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2, enabled=batch_norm)

        # Final linear transformation (1x1 conv), back to input_size channels.
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=input_size,  # lin_neurons,
            kernel_size=1,
        )

    # @torch.cuda.amp.autocast(enabled=True, dtype=torch.float32)
    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths : torch.Tensor, optional
            Relative lengths in [0, 1], forwarded to layers that accept them.
        """
        # Minimize transpose for efficiency
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            # Some blocks accept `lengths`, others (e.g. the first TDNN) do
            # not — fall back to a plain call on TypeError.
            try:
                x = layer(x, lengths=lengths)
            except TypeError:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation (skip the initial TDNN output)
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        # Attentive Statistical Pooling -> (batch, 2*channels[-1], 1)
        x = self.asp(x, lengths=lengths)
        x = self.asp_bn(x)

        # Final linear transformation
        x = self.fc(x)

        # Drop the trailing singleton time axis -> (batch, input_size)
        x = x.squeeze(-1)
        return x
927
+
928
+
929
if __name__ == "__main__":
    # Smoke test: instantiate the encoder (128-dim input, batch norm disabled).
    ecapa = ECAPA_TDNN(128, batch_norm=False)
    # print(ecapa)
lemas_tts/model/backbones/mmdit.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ from torch import nn
14
+
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from lemas_tts.model.modules import (
18
+ TimestepEmbedding,
19
+ ConvPositionEmbedding,
20
+ MMDiTBlock,
21
+ AdaLayerNorm_Final,
22
+ precompute_freqs_cis,
23
+ get_pos_embed_indices,
24
+ )
25
+
26
+
27
+ # text embedding
28
+
29
+
30
class TextEmbedding(nn.Module):
    """Text embedding with absolute sinusoidal position encoding.

    Index 0 is reserved as the filler token (inputs are shifted by +1), and
    embeddings at filler/pad positions can optionally be zeroed out.
    """

    def __init__(self, out_dim, text_num_embeds, mask_padding=True):
        super().__init__()
        self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim)  # will use 0 as filler token

        self.mask_padding = mask_padding  # mask filler and batch padding tokens or not

        # Precomputed sinusoidal table; non-persistent buffer so it follows
        # .to(device) but is not written to checkpoints.
        self.precompute_max_pos = 1024
        self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)

    def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]:  # noqa: F722
        text = text + 1  # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
        if self.mask_padding:
            # Remember filler/pad positions before any CFG zeroing.
            text_mask = text == 0

        if drop_text:  # cfg for text
            text = torch.zeros_like(text)

        text = self.text_embed(text)  # b nt -> b nt d

        # sinus pos emb
        batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
        batch_text_len = text.shape[1]
        pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
        text_pos_embed = self.freqs_cis[pos_idx]

        text = text + text_pos_embed

        if self.mask_padding:
            # Zero embeddings at filler/pad positions so they carry no signal.
            text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)

        return text
62
+
63
+
64
+ # noised input & masked cond audio embedding
65
+
66
+
67
class AudioEmbedding(nn.Module):
    """Joint embedding of noised audio and (possibly dropped) condition audio.

    The two mel streams are concatenated along the feature axis, projected to
    the model width, and refined with a residual convolutional position
    embedding.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(2 * in_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False):  # noqa: F722
        # Classifier-free guidance: replace the condition with zeros when dropped.
        effective_cond = torch.zeros_like(cond) if drop_audio_cond else cond
        stacked = torch.cat((x, effective_cond), dim=-1)
        projected = self.linear(stacked)
        return self.conv_pos_embed(projected) + projected
80
+
81
+
82
+ # Transformer backbone using MM-DiT blocks
83
+
84
+
85
class MMDiT(nn.Module):
    """Transformer backbone built from MM-DiT blocks.

    Text (context branch, `c`) and audio (main branch, `x`) are processed
    jointly; the timestep embedding `t` modulates every block via AdaLN.
    """

    def __init__(
        self,
        *,
        dim,
        depth=8,
        heads=8,
        dim_head=64,
        dropout=0.1,
        ff_mult=4,
        mel_dim=100,
        text_num_embeds=256,
        text_mask_padding=True,
        qk_norm=None,
    ):
        super().__init__()

        self.time_embed = TimestepEmbedding(dim)
        self.text_embed = TextEmbedding(dim, text_num_embeds, mask_padding=text_mask_padding)
        self.text_cond, self.text_uncond = None, None  # text cache
        self.audio_embed = AudioEmbedding(mel_dim, dim)

        self.rotary_embed = RotaryEmbedding(dim_head)

        self.dim = dim
        self.depth = depth

        self.transformer_blocks = nn.ModuleList(
            [
                MMDiTBlock(
                    dim=dim,
                    heads=heads,
                    dim_head=dim_head,
                    dropout=dropout,
                    ff_mult=ff_mult,
                    # Last block only outputs the audio branch; the text branch
                    # ends there.
                    context_pre_only=i == depth - 1,
                    qk_norm=qk_norm,
                )
                for i in range(depth)
            ]
        )
        self.norm_out = AdaLayerNorm_Final(dim)  # final modulation
        self.proj_out = nn.Linear(dim, mel_dim)

        self.initialize_weights()

    def initialize_weights(self):
        """DiT-style init: zero the AdaLN modulation and the output layers so
        the network starts close to an identity mapping."""
        # Zero-out AdaLN layers in MMDiT blocks:
        for block in self.transformer_blocks:
            nn.init.constant_(block.attn_norm_x.linear.weight, 0)
            nn.init.constant_(block.attn_norm_x.linear.bias, 0)
            nn.init.constant_(block.attn_norm_c.linear.weight, 0)
            nn.init.constant_(block.attn_norm_c.linear.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.norm_out.linear.weight, 0)
        nn.init.constant_(self.norm_out.linear.bias, 0)
        nn.init.constant_(self.proj_out.weight, 0)
        nn.init.constant_(self.proj_out.bias, 0)

    def clear_cache(self):
        """Drop cached text embeddings (kept across CFG/ODE sampling steps)."""
        self.text_cond, self.text_uncond = None, None

    def forward(
        self,
        x: float["b n d"],  # nosied input audio  # noqa: F722
        cond: float["b n d"],  # masked cond audio  # noqa: F722
        text: int["b nt"],  # text  # noqa: F722
        time: float["b"] | float[""],  # time step  # noqa: F821 F722
        drop_audio_cond,  # cfg for cond audio
        drop_text,  # cfg for text
        mask: bool["b n"] | None = None,  # noqa: F722
        cache=False,
    ):
        batch = x.shape[0]
        if time.ndim == 0:
            # Broadcast a scalar timestep to the whole batch.
            time = time.repeat(batch)

        # t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
        t = self.time_embed(time)
        if cache:
            # Reuse text embeddings across sampling steps; invalidate with
            # clear_cache() when the text changes.
            if drop_text:
                if self.text_uncond is None:
                    self.text_uncond = self.text_embed(text, drop_text=True)
                c = self.text_uncond
            else:
                if self.text_cond is None:
                    self.text_cond = self.text_embed(text, drop_text=False)
                c = self.text_cond
        else:
            c = self.text_embed(text, drop_text=drop_text)
        x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)

        # Separate rotary embeddings: audio and text sequences differ in length.
        seq_len = x.shape[1]
        text_len = text.shape[1]
        rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
        rope_text = self.rotary_embed.forward_from_seq_len(text_len)

        for block in self.transformer_blocks:
            c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)

        x = self.norm_out(x, t)
        output = self.proj_out(x)

        return output
lemas_tts/model/backbones/prosody_encoder.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prosody encoder backbone based on the Pretssel ECAPA-TDNN architecture.
3
+
4
+ This module provides:
5
+ - ProsodyEncoder: wraps an ECAPA-TDNN model to produce utterance-level
6
+ prosody embeddings from 80-dim FBANK features.
7
+ - extract_fbank_16k: utility to compute 80-bin FBANK from 16kHz audio.
8
+
9
+ It is self-contained (no fairseq2 dependency) and can be used inside
10
+ CFM or other models as a conditioning network.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+ from typing import List, Optional, Tuple
17
+ import json
18
+
19
+ import torch
20
+ import torchaudio
21
+ from torch import Tensor
22
+ from torch import nn
23
+ from torch.nn import Conv1d, LayerNorm, Module, ModuleList, ReLU, Sigmoid, Tanh, init
24
+ import torch.nn.functional as F
25
+
26
+
27
+ AUDIO_SAMPLE_RATE = 16_000
28
+
29
+
30
class ECAPA_TDNN(Module):
    """
    ECAPA-TDNN core used in Pretssel prosody encoder.

    Expects input features of shape (B, T, C) with C=80 and returns
    a normalized embedding of shape (B, embed_dim).
    """

    def __init__(
        self,
        channels: List[int],
        kernel_sizes: List[int],
        dilations: List[int],
        attention_channels: int,
        res2net_scale: int,
        se_channels: int,
        global_context: bool,
        groups: List[int],
        embed_dim: int,
        input_dim: int,
    ):
        super().__init__()
        assert len(channels) == len(kernel_sizes) == len(dilations)
        self.channels = channels
        self.embed_dim = embed_dim
        self.blocks = ModuleList()

        # Initial frame-level TDNN layer.
        self.blocks.append(
            TDNNBlock(
                input_dim,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                groups[0],
            )
        )

        # SE-Res2Net layers (all entries except the first and last).
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    groups=groups[i],
                )
            )

        # Multi-layer feature aggregation over the concatenated block outputs.
        self.mfa = TDNNBlock(
            channels[-1],
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            groups=groups[-1],
        )

        # Attentive statistics pooling -> (B, 2*channels[-1], 1).
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_norm = LayerNorm(channels[-1] * 2, eps=1e-12)

        # Final 1x1 projection down to the embedding dimension.
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=embed_dim,
            kernel_size=1,
        )

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Xavier-uniform (ReLU gain) init for every Conv1d in the network."""
        def encoder_init(m: Module) -> None:
            if isinstance(m, Conv1d):
                init.xavier_uniform_(m.weight, init.calculate_gain("relu"))

        self.apply(encoder_init)

    def forward(
        self,
        x: Tensor,
        padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        # x: (B, T, C)
        x = x.transpose(1, 2)  # (B, C, T)

        xl = []
        for layer in self.blocks:
            x = layer(x, padding_mask=padding_mask)
            xl.append(x)

        # Aggregate all SE-Res2Net outputs (skip the initial TDNN output).
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        x = self.asp(x, padding_mask=padding_mask)
        # LayerNorm operates on the channel axis, hence the transposes.
        x = self.asp_norm(x.transpose(1, 2)).transpose(1, 2)

        x = self.fc(x)

        x = x.transpose(1, 2).squeeze(1)  # (B, embed_dim)
        # L2-normalize so embeddings lie on the unit hypersphere.
        return F.normalize(x, dim=-1)
133
+
134
+
135
class TDNNBlock(Module):
    """1-D dilated convolution followed by ReLU and LayerNorm over channels."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        dilation: int,
        groups: int = 1,
    ):
        super().__init__()
        # "same" padding for the dilated kernel.
        same_pad = dilation * (kernel_size - 1) // 2
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            padding=same_pad,
            groups=groups,
        )
        self.activation = ReLU()
        self.norm = LayerNorm(out_channels, eps=1e-12)

    def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
        # x: (B, C, T). Convolve + activate, then LayerNorm over channels.
        out = self.activation(self.conv(x))
        out = out.transpose(1, 2)   # (B, T, C) for LayerNorm
        out = self.norm(out)
        return out.transpose(1, 2)  # back to (B, C, T)
159
+
160
+
161
class Res2NetBlock(Module):
    """Res2Net module: split channels into `scale` groups and process them
    hierarchically, feeding each group's output into the next one."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        scale: int = 8,
        kernel_size: int = 3,
        dilation: int = 1,
    ):
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        width_in = in_channels // scale
        width_hidden = out_channels // scale
        # One TDNN branch per group except the first, which passes through.
        self.blocks = ModuleList(
            [
                TDNNBlock(
                    width_in,
                    width_hidden,
                    kernel_size=kernel_size,
                    dilation=dilation,
                )
                for _ in range(scale - 1)
            ]
        )
        self.scale = scale

    def forward(self, x: Tensor) -> Tensor:
        chunks = torch.chunk(x, self.scale, dim=1)
        outputs = [chunks[0]]  # group 0 is an identity path
        prev = None
        for idx, chunk in enumerate(chunks[1:], start=1):
            branch = self.blocks[idx - 1]
            # Each later group also receives the previous group's output.
            prev = branch(chunk) if idx == 1 else branch(chunk + prev)
            outputs.append(prev)
        return torch.cat(outputs, dim=1)
200
+
201
+
202
class SEBlock(Module):
    """Squeeze-and-Excitation: rescale channels with a gate computed from the
    (optionally length-masked) temporal mean."""

    def __init__(
        self,
        in_channels: int,
        se_channels: int,
        out_channels: int,
    ):
        super().__init__()
        self.conv1 = Conv1d(in_channels=in_channels, out_channels=se_channels, kernel_size=1)
        self.relu = ReLU(inplace=True)
        self.conv2 = Conv1d(in_channels=se_channels, out_channels=out_channels, kernel_size=1)
        self.sigmoid = Sigmoid()

    def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
        # Squeeze: average over time, ignoring padded frames when a mask is given.
        if padding_mask is None:
            context = x.mean(dim=2, keepdim=True)
        else:
            # padding_mask: (B, T) with 1 for valid, 0 for pad
            valid = padding_mask.unsqueeze(1)  # (B, 1, T)
            frame_counts = valid.sum(dim=2, keepdim=True)
            context = (x * valid).sum(dim=2, keepdim=True) / torch.clamp(frame_counts, min=1.0)

        # Excite: bottleneck producing a per-channel gate in (0, 1).
        gate = self.relu(self.conv1(context))
        gate = self.sigmoid(self.conv2(gate))
        return gate * x
227
+
228
+
229
class AttentiveStatisticsPooling(Module):
    """Attentive statistics pooling: attention-weighted mean and std over time,
    summarizing a (N, C, L) input as a (N, 2*C, 1) utterance-level vector."""

    def __init__(
        self, channels: int, attention_channels: int = 128, global_context: bool = True
    ):
        super().__init__()
        self.eps = 1e-12
        self.global_context = global_context
        if global_context:
            # Attention also sees utterance-level mean/std, hence 3x channels in.
            self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1)
        else:
            self.tdnn = TDNNBlock(channels, attention_channels, 1, 1)

        self.tanh = Tanh()
        self.conv = Conv1d(in_channels=attention_channels, out_channels=channels, kernel_size=1)

    def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
        # x: (N, C, L)
        N, C, L = x.shape

        def _compute_statistics(
            x: Tensor, m: Tensor, dim: int = 2, eps: float = 1e-12
        ) -> Tuple[Tensor, Tensor]:
            # Weighted mean/std; clamp keeps sqrt away from zero/negative values.
            mean = (m * x).sum(dim)
            std = torch.sqrt((m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps))
            return mean, std

        if padding_mask is not None:
            mask = padding_mask
        else:
            mask = torch.ones(N, L, device=x.device, dtype=x.dtype)
        mask = mask.unsqueeze(1)  # (N, 1, L)

        if self.global_context:
            # Append utterance-level mean/std so attention can use global context.
            total = mask.sum(dim=2, keepdim=True).to(x)
            mean, std = _compute_statistics(x, mask / total)
            mean = mean.unsqueeze(2).repeat(1, 1, L)
            std = std.unsqueeze(2).repeat(1, 1, L)
            attn = torch.cat([x, mean, std], dim=1)
        else:
            attn = x

        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Padded frames get -inf so softmax assigns them zero weight.
        attn = attn.masked_fill(mask == 0, float("-inf"))

        attn = F.softmax(attn, dim=2)
        mean, std = _compute_statistics(x, attn)
        pooled_stats = torch.cat((mean, std), dim=1)
        pooled_stats = pooled_stats.unsqueeze(2)
        return pooled_stats
279
+
280
+
281
class SERes2NetBlock(Module):
    """Pretssel SE-Res2Net block: 1x1 TDNN -> Res2Net -> 1x1 TDNN -> SE,
    plus a residual path (1x1 conv when channel counts differ)."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        res2net_scale: int = 8,
        se_channels: int = 128,
        kernel_size: int = 1,
        dilation: int = 1,
        groups: int = 1,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.tdnn1 = TDNNBlock(
            in_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            groups=groups,
        )
        self.res2net_block = Res2NetBlock(
            out_channels,
            out_channels,
            res2net_scale,
            kernel_size,
            dilation,
        )
        self.tdnn2 = TDNNBlock(
            out_channels,
            out_channels,
            kernel_size=1,
            dilation=1,
            groups=groups,
        )
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        # Only project the residual when the channel count changes.
        self.shortcut = (
            Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )
            if in_channels != out_channels
            else None
        )

    def forward(self, x: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
        residual = x if self.shortcut is None else self.shortcut(x)

        out = self.tdnn1(x)
        out = self.res2net_block(out)
        out = self.tdnn2(out)
        out = self.se_block(out, padding_mask=padding_mask)
        return out + residual
335
+
336
+
337
def extract_fbank_16k(audio_16k: Tensor) -> Tensor:
    """
    Compute 80-dim FBANK features from 16kHz audio.

    Args:
        audio_16k: Tensor of shape (T,) or (1, T)
    Returns:
        fbank: Tensor of shape (T_fbank, 80)
    """
    if audio_16k.ndim == 1:
        audio_16k = audio_16k.unsqueeze(0)

    # kaldi.fbank needs at least one full analysis window
    # (default 25 ms @ 16 kHz -> 400 samples); tile short clips until they fit.
    min_len = 400
    num_samples = audio_16k.shape[-1]
    if num_samples < min_len:
        # audio_16k is guaranteed 2-D here (see unsqueeze above), so the
        # former 1-D repeat branch was dead code and has been removed.
        repeat_times = min_len // num_samples + 1
        audio_16k = audio_16k.repeat(1, repeat_times)

    fbank = torchaudio.compliance.kaldi.fbank(
        audio_16k,
        num_mel_bins=80,
        sample_frequency=AUDIO_SAMPLE_RATE,
    )
    return fbank
362
+
363
+
364
class ProsodyEncoder(nn.Module):
    """
    High-level wrapper for the Pretssel prosody encoder.

    Usage:
        encoder = ProsodyEncoder(cfg_path, ckpt_path, freeze=True)
        emb = encoder(fbank_batch)  # (B, 512)
    """

    def __init__(self, cfg_path: Path, ckpt_path: Path, freeze: bool = True):
        super().__init__()
        model_cfg = self._load_pretssel_model_cfg(cfg_path)
        self.encoder = self._build_prosody_encoder(model_cfg)
        self._load_prosody_encoder_state(self.encoder, ckpt_path)
        if freeze:
            # Freeze the encoder so it acts as a fixed feature extractor.
            for p in self.encoder.parameters():
                p.requires_grad = False

    @staticmethod
    def _load_pretssel_model_cfg(cfg_path: Path) -> dict:
        """Read a Pretssel JSON config and return its top-level 'model' dict."""
        cfg = json.loads(cfg_path.read_text())
        if "model" not in cfg:
            raise ValueError(f"{cfg_path} does not contain a top-level 'model' key.")
        return cfg["model"]

    @staticmethod
    def _build_prosody_encoder(model_cfg: dict) -> ECAPA_TDNN:
        """Instantiate the ECAPA-TDNN from the 'prosody_*' config entries."""
        encoder = ECAPA_TDNN(
            channels=model_cfg["prosody_channels"],
            kernel_sizes=model_cfg["prosody_kernel_sizes"],
            dilations=model_cfg["prosody_dilations"],
            attention_channels=model_cfg["prosody_attention_channels"],
            res2net_scale=model_cfg["prosody_res2net_scale"],
            se_channels=model_cfg["prosody_se_channels"],
            global_context=model_cfg["prosody_global_context"],
            groups=model_cfg["prosody_groups"],
            embed_dim=model_cfg["prosody_embed_dim"],
            input_dim=model_cfg["input_feat_per_channel"],
        )
        return encoder

    @staticmethod
    def _load_prosody_encoder_state(model: Module, ckpt_path: Path) -> None:
        """Load encoder weights from a checkpoint.

        If the state dict carries 'prosody_encoder.' / 'prosody_encoder_model.'
        prefixes (i.e. it was saved from a larger model), keep only those
        entries with the prefix stripped. Loading is strict in effect:
        ``strict=False`` is used only to collect the key diff, and any
        missing/unexpected key raises RuntimeError.
        """
        state = torch.load(ckpt_path, map_location="cpu")
        if isinstance(state, dict):
            if all(isinstance(k, str) for k in state.keys()) and (
                any(k.startswith("prosody_encoder.") for k in state.keys())
                or any(k.startswith("prosody_encoder_model.") for k in state.keys())
            ):
                # Keep only prosody-encoder entries, dropping their prefix.
                state = {
                    k.replace("prosody_encoder_model.", "", 1).replace("prosody_encoder.", "", 1): v
                    for k, v in state.items()
                    if k.startswith("prosody_encoder.") or k.startswith("prosody_encoder_model.")
                }
        missing, unexpected = model.load_state_dict(state, strict=False)
        if missing or unexpected:
            raise RuntimeError(
                f"Error loading checkpoint {ckpt_path}: missing keys={missing}, "
                f"unexpected keys={unexpected}"
            )

    def forward(self, fbank: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Args:
            fbank: Tensor of shape (B, T, 80)
            padding_mask: Optional tensor of shape (B, T) with 1 for valid.
        Returns:
            emb: Tensor of shape (B, 512)
        """
        return self.encoder(fbank, padding_mask=padding_mask)
lemas_tts/model/backbones/unett.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+ from typing import Literal
12
+
13
+ import torch
14
+ from torch import nn
15
+ import torch.nn.functional as F
16
+
17
+ from x_transformers import RMSNorm
18
+ from x_transformers.x_transformers import RotaryEmbedding
19
+
20
+ from lemas_tts.model.modules import (
21
+ TimestepEmbedding,
22
+ ConvNeXtV2Block,
23
+ ConvPositionEmbedding,
24
+ Attention,
25
+ AttnProcessor,
26
+ FeedForward,
27
+ precompute_freqs_cis,
28
+ get_pos_embed_indices,
29
+ )
30
+
31
+
32
+ # Text embedding
33
+
34
+
35
+ class TextEmbedding(nn.Module):
36
+ def __init__(self, text_num_embeds, text_dim, mask_padding=True, conv_layers=0, conv_mult=2):
37
+ super().__init__()
38
+ self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
39
+
40
+ self.mask_padding = mask_padding # mask filler and batch padding tokens or not
41
+
42
+ if conv_layers > 0:
43
+ self.extra_modeling = True
44
+ self.precompute_max_pos = 4096 # ~44s of 24khz audio
45
+ self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
46
+ self.text_blocks = nn.Sequential(
47
+ *[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
48
+ )
49
+ else:
50
+ self.extra_modeling = False
51
+
52
+ def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
53
+ text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
54
+ text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
55
+ batch, text_len = text.shape[0], text.shape[1]
56
+ text = F.pad(text, (0, seq_len - text_len), value=0)
57
+ if self.mask_padding:
58
+ text_mask = text == 0
59
+
60
+ if drop_text: # cfg for text
61
+ text = torch.zeros_like(text)
62
+
63
+ text = self.text_embed(text) # b n -> b n d
64
+
65
+ # possible extra modeling
66
+ if self.extra_modeling:
67
+ # sinus pos emb
68
+ batch_start = torch.zeros((batch,), dtype=torch.long)
69
+ pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
70
+ text_pos_embed = self.freqs_cis[pos_idx]
71
+ text = text + text_pos_embed
72
+
73
+ # convnextv2 blocks
74
+ if self.mask_padding:
75
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
76
+ for block in self.text_blocks:
77
+ text = block(text)
78
+ text = text.masked_fill(text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0)
79
+ else:
80
+ text = self.text_blocks(text)
81
+
82
+ return text
83
+
84
+
85
+ # noised input audio and context mixing embedding
86
+
87
+
88
class InputEmbedding(nn.Module):
    """Fuse noised mel, condition mel, and text embedding into model width,
    then add a residual convolutional position embedding."""

    def __init__(self, mel_dim, text_dim, out_dim):
        super().__init__()
        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)

    def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False):  # noqa: F722
        # Classifier-free guidance: zero the audio condition when dropped.
        effective_cond = torch.zeros_like(cond) if drop_audio_cond else cond
        fused = self.proj(torch.cat((x, effective_cond, text_embed), dim=-1))
        return self.conv_pos_embed(fused) + fused
+
102
+
103
+ # Flat UNet Transformer backbone
104
+
105
+
106
+ class UNetT(nn.Module):
107
+ def __init__(
108
+ self,
109
+ *,
110
+ dim,
111
+ depth=8,
112
+ heads=8,
113
+ dim_head=64,
114
+ dropout=0.1,
115
+ ff_mult=4,
116
+ mel_dim=100,
117
+ text_num_embeds=256,
118
+ text_dim=None,
119
+ text_mask_padding=True,
120
+ qk_norm=None,
121
+ conv_layers=0,
122
+ pe_attn_head=None,
123
+ skip_connect_type: Literal["add", "concat", "none"] = "concat",
124
+ ):
125
+ super().__init__()
126
+ assert depth % 2 == 0, "UNet-Transformer's depth should be even."
127
+
128
+ self.time_embed = TimestepEmbedding(dim)
129
+ if text_dim is None:
130
+ text_dim = mel_dim
131
+ self.text_embed = TextEmbedding(
132
+ text_num_embeds, text_dim, mask_padding=text_mask_padding, conv_layers=conv_layers
133
+ )
134
+ self.text_cond, self.text_uncond = None, None # text cache
135
+ self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
136
+
137
+ self.rotary_embed = RotaryEmbedding(dim_head)
138
+
139
+ # transformer layers & skip connections
140
+
141
+ self.dim = dim
142
+ self.skip_connect_type = skip_connect_type
143
+ needs_skip_proj = skip_connect_type == "concat"
144
+
145
+ self.depth = depth
146
+ self.layers = nn.ModuleList([])
147
+
148
+ for idx in range(depth):
149
+ is_later_half = idx >= (depth // 2)
150
+
151
+ attn_norm = RMSNorm(dim)
152
+ attn = Attention(
153
+ processor=AttnProcessor(pe_attn_head=pe_attn_head),
154
+ dim=dim,
155
+ heads=heads,
156
+ dim_head=dim_head,
157
+ dropout=dropout,
158
+ qk_norm=qk_norm,
159
+ )
160
+
161
+ ff_norm = RMSNorm(dim)
162
+ ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")
163
+
164
+ skip_proj = nn.Linear(dim * 2, dim, bias=False) if needs_skip_proj and is_later_half else None
165
+
166
+ self.layers.append(
167
+ nn.ModuleList(
168
+ [
169
+ skip_proj,
170
+ attn_norm,
171
+ attn,
172
+ ff_norm,
173
+ ff,
174
+ ]
175
+ )
176
+ )
177
+
178
+ self.norm_out = RMSNorm(dim)
179
+ self.proj_out = nn.Linear(dim, mel_dim)
180
+
181
+ def clear_cache(self):
182
+ self.text_cond, self.text_uncond = None, None
183
+
184
+ def forward(
185
+ self,
186
+ x: float["b n d"], # nosied input audio # noqa: F722
187
+ cond: float["b n d"], # masked cond audio # noqa: F722
188
+ text: int["b nt"], # text # noqa: F722
189
+ time: float["b"] | float[""], # time step # noqa: F821 F722
190
+ drop_audio_cond, # cfg for cond audio
191
+ drop_text, # cfg for text
192
+ mask: bool["b n"] | None = None, # noqa: F722
193
+ cache=False,
194
+ ):
195
+ batch, seq_len = x.shape[0], x.shape[1]
196
+ if time.ndim == 0:
197
+ time = time.repeat(batch)
198
+
199
+ # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
200
+ t = self.time_embed(time)
201
+ if cache:
202
+ if drop_text:
203
+ if self.text_uncond is None:
204
+ self.text_uncond = self.text_embed(text, seq_len, drop_text=True)
205
+ text_embed = self.text_uncond
206
+ else:
207
+ if self.text_cond is None:
208
+ self.text_cond = self.text_embed(text, seq_len, drop_text=False)
209
+ text_embed = self.text_cond
210
+ else:
211
+ text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
212
+ x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
213
+
214
+ # postfix time t to input x, [b n d] -> [b n+1 d]
215
+ x = torch.cat([t.unsqueeze(1), x], dim=1) # pack t to x
216
+ if mask is not None:
217
+ mask = F.pad(mask, (1, 0), value=1)
218
+
219
+ rope = self.rotary_embed.forward_from_seq_len(seq_len + 1)
220
+
221
+ # flat unet transformer
222
+ skip_connect_type = self.skip_connect_type
223
+ skips = []
224
+ for idx, (maybe_skip_proj, attn_norm, attn, ff_norm, ff) in enumerate(self.layers):
225
+ layer = idx + 1
226
+
227
+ # skip connection logic
228
+ is_first_half = layer <= (self.depth // 2)
229
+ is_later_half = not is_first_half
230
+
231
+ if is_first_half:
232
+ skips.append(x)
233
+
234
+ if is_later_half:
235
+ skip = skips.pop()
236
+ if skip_connect_type == "concat":
237
+ x = torch.cat((x, skip), dim=-1)
238
+ x = maybe_skip_proj(x)
239
+ elif skip_connect_type == "add":
240
+ x = x + skip
241
+
242
+ # attention and feedforward blocks
243
+ x = attn(attn_norm(x), rope=rope, mask=mask) + x
244
+ x = ff(ff_norm(x)) + x
245
+
246
+ assert len(skips) == 0
247
+
248
+ x = self.norm_out(x)[:, 1:, :] # unpack t from x
249
+
250
+ return self.proj_out(x)
lemas_tts/model/cfm.py ADDED
@@ -0,0 +1,899 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from random import random
13
+ import random as _random
14
+ from typing import Callable, Dict, OrderedDict
15
+ import math
16
+ from pathlib import Path
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ import torchaudio
21
+ from torch import nn
22
+ from torch.nn.utils.rnn import pad_sequence
23
+ from torchdiffeq import odeint
24
+
25
+ from lemas_tts.model.modules import MelSpec
26
+ from lemas_tts.model.modules import MIEsitmator, AccentClassifier, grad_reverse
27
+ from lemas_tts.model.backbones.ecapa_tdnn import ECAPA_TDNN
28
+ from lemas_tts.model.backbones.prosody_encoder import ProsodyEncoder, extract_fbank_16k
29
+ from lemas_tts.model.utils import (
30
+ default,
31
+ exists,
32
+ lens_to_mask,
33
+ list_str_to_idx,
34
+ list_str_to_tensor,
35
+ mask_from_frac_lengths,
36
+ )
37
+
38
+
39
def clip_and_shuffle(mel, mel_len, sample_rate=24000, hop_length=256, ratio=None):
    """
    Randomly clip a mel-spectrogram segment and shuffle ~1-second chunks to
    create an accent-invariant conditioning segment.

    This is an inference-time utility used by the accent GRL path.

    Args:
        mel: tensor of shape [n_mels, T].
        mel_len: int, original mel length (T).
        sample_rate: audio sample rate used to derive frames-per-second.
        hop_length: mel hop length in samples.
        ratio: optional float in (0, 1]; when given, the clipped segment is
            exactly ``ratio * mel_len`` frames, otherwise a random 25%-75%.

    Returns:
        Tensor with the same shape as ``mel``, built from shuffled chunks.
    """
    frames_per_second = int(sample_rate / hop_length)  # ~93 frames/second at 24 kHz / 256 hop

    # ---- 1. Randomly crop 25%~75% of the original length (or ratio * length) ----
    total_len = mel_len
    if not ratio:
        seg_len = _random.randint(int(0.25 * total_len), int(0.75 * total_len))
    else:
        seg_len = int(total_len * ratio)
    # Guard against a degenerate zero-length segment on very short inputs,
    # which would leave `chunks` empty and make torch.cat below raise.
    seg_len = max(1, min(seg_len, total_len))
    start = _random.randint(0, max(0, total_len - seg_len))
    mel_seg = mel[:, start : start + seg_len]

    # ---- 2. Split into ~1-second chunks ----
    n_chunks = (mel_seg.size(1) + frames_per_second - 1) // frames_per_second
    chunks = [
        mel_seg[:, i * frames_per_second : (i + 1) * frames_per_second]
        for i in range(n_chunks)
    ]

    # ---- 3. Shuffle chunk order ----
    _random.shuffle(chunks)
    shuffled_mel = torch.cat(chunks, dim=1)

    # ---- 4. Repeat random chunks until reaching the original length ----
    if shuffled_mel.size(1) < total_len:
        repeat_chunks = []
        # Only the shortfall needs to be filled (the original compared against
        # total_len, appending more chunks than necessary before trimming).
        needed = total_len - shuffled_mel.size(1)
        while sum(c.size(1) for c in repeat_chunks) < needed:
            repeat_chunks.append(_random.choice(chunks))
        shuffled_mel = torch.cat([shuffled_mel] + repeat_chunks, dim=1)

    # ---- 5. Trim to exactly mel_len ----
    shuffled_mel = shuffled_mel[:, :total_len]
    assert shuffled_mel.shape == mel.shape, f"shuffled_mel.shape != mel.shape: {shuffled_mel.shape} != {mel.shape}"

    return shuffled_mel
84
+
85
+ class CFM(nn.Module):
86
    def __init__(
        self,
        transformer: nn.Module,
        sigma=0.0,
        odeint_kwargs: dict = dict(
            # atol = 1e-5,
            # rtol = 1e-5,
            method="euler"  # 'midpoint'
        ),
        audio_drop_prob=0.3,
        text_drop_prob=0.1,
        num_channels=None,
        mel_spec_module: nn.Module | None = None,
        mel_spec_kwargs: dict = dict(),
        frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
        vocab_char_map: dict[str, int] | None = None,
        use_ctc_loss: bool = False,
        use_spk_enc: bool = False,
        use_prosody_encoder: bool = False,
        prosody_cfg_path: str | None = None,
        prosody_ckpt_path: str | None = None,
    ):
        """
        Conditional flow-matching TTS model wrapping a transformer backbone.

        Args:
            transformer: flow-prediction backbone; must expose a ``.dim`` attribute.
            sigma: conditional-flow noise scale (stored on the instance; 0.0 here).
            odeint_kwargs: options forwarded to ``torchdiffeq.odeint`` at sampling time.
            audio_drop_prob: probability of dropping the audio condition (CFG training).
            text_drop_prob: probability of dropping text (and audio) condition (CFG training).
            num_channels: mel channel count; defaults to the mel module's ``n_mel_channels``.
            mel_spec_module: optional pre-built mel extractor; a ``MelSpec`` is built otherwise.
            mel_spec_kwargs: kwargs for the default ``MelSpec`` when no module is given.
            frac_lengths_mask: (min, max) fraction range for the random training mask span.
            vocab_char_map: char -> index map used to tokenize text.
            use_ctc_loss: enable the auxiliary MI/CTC estimator loss.
            use_spk_enc: enable the ECAPA-TDNN speaker-encoder branch.
            use_prosody_encoder: enable the prosody-encoder branch (only effective
                when both ``prosody_cfg_path`` and ``prosody_ckpt_path`` are given).
            prosody_cfg_path: config file path for the prosody encoder.
            prosody_ckpt_path: checkpoint path for the prosody encoder.
        """
        super().__init__()

        self.frac_lengths_mask = frac_lengths_mask

        # mel spec
        self.mel_spec = default(mel_spec_module, MelSpec(**mel_spec_kwargs))
        num_channels = default(num_channels, self.mel_spec.n_mel_channels)
        self.num_channels = num_channels

        # classifier-free guidance
        self.audio_drop_prob = audio_drop_prob
        self.text_drop_prob = text_drop_prob

        # transformer
        self.transformer = transformer
        dim = transformer.dim
        self.dim = dim

        # conditional flow related
        self.sigma = sigma

        # sampling related
        self.odeint_kwargs = odeint_kwargs

        # vocab map for tokenization
        self.vocab_char_map = vocab_char_map

        # Prosody encoder (Pretssel ECAPA-TDNN); only enabled when both paths exist.
        self.use_prosody_encoder = (
            use_prosody_encoder and prosody_cfg_path is not None and prosody_ckpt_path is not None
        )
        if self.use_prosody_encoder:
            cfg_path = Path(prosody_cfg_path)
            ckpt_path = Path(prosody_ckpt_path)
            # frozen: prosody encoder weights are not trained here
            self.prosody_encoder = ProsodyEncoder(cfg_path, ckpt_path, freeze=True)
            # 512-d prosody -> mel channel dimension
            self.prosody_to_mel = nn.Linear(512, self.num_channels)
            self.prosody_dropout = nn.Dropout(p=0.2)
        else:
            self.prosody_encoder = None

        # Speaker encoder
        self.use_spk_enc = use_spk_enc
        if use_spk_enc:
            self.speaker_encoder = ECAPA_TDNN(
                self.num_channels,
                self.dim,
                channels=[512, 512, 512, 512, 1536],
                kernel_sizes=[5, 3, 3, 3, 1],
                dilations=[1, 2, 3, 4, 1],
                attention_channels=128,
                res2net_scale=4,
                se_channels=128,
                global_context=True,
                batch_norm=True,
            )

        self.use_ctc_loss = use_ctc_loss
        if use_ctc_loss:
            self.ctc = MIEsitmator(len(self.vocab_char_map), self.num_channels, self.dim, dropout=self.text_drop_prob)

        # Accent classifier fed through a gradient-reversal layer (GRL) during
        # training to encourage accent-invariant conditioning.
        # NOTE(review): num_accents=12 is hard-coded — presumably the number of
        # language/accent classes in the training data; confirm against dataset.
        self.accent_classifier = AccentClassifier(input_dim=self.num_channels, hidden_dim=self.dim, num_accents=12)
        self.accent_criterion = nn.CrossEntropyLoss()
173
+
174
+ def load_partial_weights(self, model: nn.Module,
175
+ ckpt_path: str,
176
+ device="cpu",
177
+ verbose=True) -> int:
178
+ """
179
+ 仅加载形状匹配的参数,其余跳过。
180
+ 返回成功加载的参数数量。
181
+ """
182
+ state_dict = torch.load(ckpt_path, map_location=device)
183
+ model_dict = model.state_dict()
184
+
185
+ ok_count = 0
186
+ new_dict: OrderedDict[str, torch.Tensor] = OrderedDict()
187
+
188
+ for k, v in state_dict.items():
189
+ if k in model_dict and v.shape == model_dict[k].shape:
190
+ new_dict[k] = v
191
+ ok_count += 1
192
+ else:
193
+ if verbose:
194
+ print(f"[SKIP] {k} ckpt:{v.shape} model:{model_dict[k].shape if k in model_dict else 'N/A'}")
195
+
196
+ model_dict.update(new_dict)
197
+ model.load_state_dict(model_dict)
198
+ if verbose:
199
+ print(f"=> 成功加载 {ok_count}/{len(state_dict)} 个参数")
200
+ return ok_count
201
+
202
    @property
    def device(self):
        # Device of the first parameter; assumes the module has at least one parameter.
        return next(self.parameters()).device
205
+
206
    @torch.no_grad()
    def sample(
        self,
        cond: float["b n d"] | float["b nw"],  # noqa: F722
        text: int["b nt"] | list[str],  # noqa: F722
        duration: int | int["b"],  # noqa: F821
        *,
        lens: int["b"] | None = None,  # noqa: F821
        steps=32,
        cfg_strength=1.0,
        sway_sampling_coef=None,
        seed: int | None = None,
        max_duration=4096,
        vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None,  # noqa: F722
        no_ref_audio=False,
        duplicate_test=False,
        t_inter=0.1,
        edit_mask=None,
        use_acc_grl = True,
        use_prosody_encoder = True,
        ref_ratio = 1,
    ):
        """
        Sample mel frames (or a waveform, when a vocoder is supplied) by
        integrating the learned flow ODE, conditioned on a reference
        audio/mel prompt and text.

        Args:
            cond: reference prompt — raw wave (b, nw) or mel (b, n, d).
            text: token ids or list of strings (tokenized via vocab_char_map).
            duration: total output length in mel frames (scalar or per-sample).
            lens: valid prompt lengths; defaults to the full prompt.
            steps: number of ODE steps.
            cfg_strength: classifier-free guidance scale.
            sway_sampling_coef: optional exponent offset for the time grid;
                clamped by a numerically safe maximum.
            seed: per-sample RNG seed for the initial noise.
            max_duration: hard cap on output frames.
            vocoder: optional mel -> waveform callable.
            no_ref_audio: replace the prompt with matched-mean noise.
            duplicate_test: debug mode starting integration at t_inter.
            edit_mask: extra mask ANDed into the prompt mask (for editing).
            use_acc_grl: route the condition through the accent GRL.
            use_prosody_encoder: apply the global prosody embedding (if built).
            ref_ratio: < 1 shuffles a clipped fraction of the prompt as condition.

        Returns:
            (out, trajectory): generated mel [b n d] (or waveform after the
            vocoder) and the full ODE trajectory.
        """
        self.eval()

        # raw wave -> mel, keep a copy for prosody encoder if available
        raw_audio = None
        if cond.ndim == 2:
            raw_audio = cond.clone()  # (B, nw)
            cond = self.mel_spec(cond)
            cond = cond.permute(0, 2, 1)
            assert cond.shape[-1] == self.num_channels

        cond = cond.to(next(self.parameters()).dtype)
        # per-sample mean over time, used later to renormalize generated frames
        cond_mean = cond.mean(dim=1, keepdim=True)
        batch, cond_seq_len, device = *cond.shape[:2], cond.device
        if not exists(lens):
            lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)

        # optional global prosody conditioning at inference (one embedding per sample)
        prosody_mel_cond = None
        prosody_text_cond = None
        prosody_embeds = None
        if self.prosody_encoder is not None and raw_audio is not None and use_prosody_encoder:
            embeds = []
            for b in range(batch):
                audio_b = raw_audio[b].unsqueeze(0)  # (1, nw)
                src_sr = self.mel_spec.target_sample_rate
                # prosody encoder expects 16 kHz fbank features
                if src_sr != 16_000:
                    audio_16k = torchaudio.functional.resample(
                        audio_b, src_sr, 16_000
                    ).squeeze(0)
                else:
                    audio_16k = audio_b.squeeze(0)
                fbank = extract_fbank_16k(audio_16k)
                fbank = fbank.unsqueeze(0).to(device=device, dtype=cond.dtype)
                emb = self.prosody_encoder(fbank, padding_mask=None)[0]  # (512,)
                embeds.append(emb)
            prosody_embeds = torch.stack(embeds, dim=0)  # (B, 512)
            # broadcast along mel and text
            prosody_mel_cond = prosody_embeds[:, None, :].expand(-1, cond_seq_len, -1)

        if use_acc_grl:
            # ref_ratio < 1: condition on a clipped + chunk-shuffled fraction of
            # the prompt (accent-invariant); otherwise use the full prompt.
            # NOTE(review): clip_and_shuffle path assumes batch == 1 (squeeze(0)).
            if ref_ratio < 1:
                rand_mel = clip_and_shuffle(cond.permute(0, 2, 1).squeeze(0), cond.shape[1], ratio=ref_ratio)
                rand_mel = rand_mel.unsqueeze(0).permute(0, 2, 1)
                assert rand_mel.shape == cond.shape, f"Shape diff: rand_mel.shape: {rand_mel.shape}, cond.shape: {cond.shape}"
                cond_grl = grad_reverse(rand_mel, lambda_=1.0)
            else:
                # grad_reverse is identity in forward; relevant only if grads flow
                cond_grl = grad_reverse(cond, lambda_=1.0)

        # text

        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # duration

        cond_mask = lens_to_mask(lens)
        if edit_mask is not None:
            cond_mask = cond_mask & edit_mask

        if isinstance(duration, int):
            duration = torch.full((batch,), duration, device=device, dtype=torch.long)

        duration = torch.maximum(
            torch.maximum((text != -1).sum(dim=-1), lens) + 1, duration
        )  # duration at least text/audio prompt length plus one token, so something is generated
        # clamp and convert max_duration to python int for padding ops
        duration = duration.clamp(max=max_duration)
        max_duration = int(duration.amax().item())

        # duplicate test corner for inner time step oberservation
        if duplicate_test:
            test_cond = F.pad(cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0)

        cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)

        if prosody_mel_cond is not None:
            prosody_mel_cond = F.pad(
                prosody_mel_cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0
            )
            # additive prosody conditioning in mel space
            prosody_mel_proj = self.prosody_to_mel(prosody_mel_cond)
            cond = cond + prosody_mel_proj

        if no_ref_audio:
            # replace the prompt with low-variance noise whose per-channel mean
            # matches the original prompt mean
            random_cond = torch.randn_like(cond) * 0.1 + cond_mean
            random_cond = random_cond / random_cond.mean(dim=1, keepdim=True) * cond_mean
            print("cond:", cond.mean(), cond.max(), cond.min(), "random_cond:", random_cond.mean(), random_cond.max(), random_cond.min(), "mean_cond:", cond_mean.shape)
            cond = random_cond

        cond_mask = F.pad(cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False)
        cond_mask = cond_mask.unsqueeze(-1)

        if use_acc_grl:
            cond_grl = F.pad(cond_grl, (0, 0, 0, max_duration - cond_seq_len), value=0.0)

        # NOTE: this outer step_cond is shadowed inside fn(); kept because its
        # dtype is used below when building y0.
        step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))  # allow direct control (cut cond audio) with lens passed in

        if batch > 1:
            mask = lens_to_mask(duration)
        else:  # save memory and speed up, as single inference need no mask currently
            mask = None

        # neural ode

        def compute_sway_max(steps: int,
                             t_start: float = 0.0,
                             dtype=torch.float32,
                             min_ratio: float | None = None,
                             safety_factor: float = 0.5) -> float:
            """
            Compute a safe upper bound for sway_sampling_coef given steps and t_start.

            - steps: number of ODE steps
            - t_start: start time in [0,1)
            - dtype: torch dtype (for machine eps)
            - min_ratio: smallest distinguishable dt^p (if None, use conservative default)
            - safety_factor: scale down the theoretical maximum to be safe
            """
            assert 0.0 <= t_start < 1.0
            dt = (1.0 - t_start) / max(1, steps)
            eps = torch.finfo(dtype).eps

            if min_ratio is None:
                # conservative default: ~100 * eps (float32 -> ~1e-5)
                min_ratio = max(1e-9, 1e2 * float(eps))

            if dt >= 0.9:
                p_max = 1.0 + 10.0
            else:
                # solve dt^p >= min_ratio => p <= log(min_ratio)/log(dt)
                p_max = math.log(min_ratio) / math.log(dt)

            sway_max = max(0.0, p_max - 1.0)
            sway_max = sway_max * float(safety_factor)
            return torch.tensor(sway_max, device=device, dtype=dtype)

        # prepare text-side prosody conditioning if embeddings available
        if prosody_embeds is not None:
            text_len = text.shape[1]
            prosody_text_cond = prosody_embeds[:, None, :].expand(-1, text_len, -1)
        else:
            prosody_text_cond = None

        def fn(t, x):
            # ODE right-hand side: predicted flow at time t, with CFG.
            # At each step the conditioning is fixed.
            if use_acc_grl:
                step_cond = torch.where(cond_mask, cond_grl, torch.zeros_like(cond_grl))
            else:
                step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))

            # predict flow
            pred = self.transformer(
                x=x,
                cond=step_cond,
                text=text,
                time=t,
                mask=mask,
                drop_audio_cond=False,
                drop_text=False,
                cache=True,
                prosody_text=prosody_text_cond,
            )
            if cfg_strength < 1e-5:
                return pred

            # unconditional pass for classifier-free guidance
            null_pred = self.transformer(
                x=x,
                cond=step_cond,
                text=text,
                time=t,
                mask=mask,
                drop_audio_cond=True,
                drop_text=True,
                cache=True,
                prosody_text=prosody_text_cond,
            )
            # time-dependent CFG weight: strongest at t=0, decays quadratically
            cfg_t = cfg_strength * ((1 - t) ** 2)
            res = pred + (pred - null_pred) * cfg_t
            # clamp to keep the ODE state bounded
            res = res.clamp(-20, 20)
            return res

        # noise input
        # to make sure batch inference result is same with different batch size, and for sure single inference
        # still some difference maybe due to convolutional layers
        y0 = []
        for dur in duration:
            if exists(seed):
                torch.manual_seed(seed)
            y0.append(torch.randn(dur, self.num_channels, device=self.device, dtype=step_cond.dtype))
        y0 = pad_sequence(y0, padding_value=0, batch_first=True)

        t_start = 0

        # duplicate test corner for inner time step oberservation
        if duplicate_test:
            t_start = t_inter
            y0 = (1 - t_start) * y0 + t_start * test_cond
            steps = int(steps * (1 - t_start))

        t = torch.linspace(t_start, 1, int(steps + 1), device=self.device, dtype=step_cond.dtype)

        # sway sampling: warp the time grid t -> t^(1+coef), with coef clamped
        # to a numerically safe bound derived from the step size
        sway_max = compute_sway_max(steps, t_start=t_start, dtype=step_cond.dtype, min_ratio=1e-9, safety_factor=0.7)
        if sway_sampling_coef is not None:
            sway_sampling_coef = min(sway_max, sway_sampling_coef)
            t = t ** (1 + sway_sampling_coef)
        else:
            t = t ** (1 + sway_max)

        trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
        # drop the text-embedding cache populated by cache=True above
        self.transformer.clear_cache()

        sampled = trajectory[-1]
        out = sampled
        # paste the original prompt back over the conditioned region
        out = torch.where(cond_mask, cond, out)

        # The generated (zero-padded) region of `out` gets its own mean, which
        # is then shifted so that it roughly matches the prompt (cond) mean.
        if no_ref_audio:
            out_mean = out[:,cond_seq_len:,:].mean(dim=1, keepdim=True)
            out[:,cond_seq_len:,:] = out[:,cond_seq_len:,:] - (out_mean - cond_mean)

        if exists(vocoder):
            out = out.permute(0, 2, 1)
            out = vocoder(out)
        return out, trajectory
474
+
475
+
476
+ def info_nce_speaker(self,
477
+ e_gt: torch.Tensor,
478
+ e_pred: torch.Tensor,
479
+ temperature: float = 0.1):
480
+ """
481
+ InfoNCE loss for speaker encoder training.
482
+ 同一条样本的 e_gt 与 e_pred 互为正例,其余均为负例。
483
+
484
+ Args:
485
+ temperature: 温度缩放 τ
486
+
487
+ Returns:
488
+ loss: 标量 tensor,可 backward
489
+ """
490
+ B = e_gt.size(0)
491
+ # 2. L2 归一化
492
+ e_gt = F.normalize(e_gt, dim=1)
493
+ e_pred = F.normalize(e_pred, dim=1)
494
+
495
+ # 3. 计算 B×B 相似度矩阵(pred 对 gt)
496
+ logits = torch.einsum('bd,cd->bc', e_pred, e_gt) / temperature # [B, B]
497
+
498
+ # 4. 正例标签正好是对角线
499
+ labels = torch.arange(B, device=logits.device)
500
+
501
+ # 5. InfoNCE = cross-entropy over in-batch negatives
502
+ loss = F.cross_entropy(logits, labels)
503
+ return loss
504
+
505
+
506
    def forward_old(
        self,
        batchs: Dict[str, torch.Tensor],
        *,
        noise_scheduler: str | None = None,
    ):
        """
        Legacy training forward: flow-matching loss plus optional speaker
        InfoNCE, CTC and accent-GRL auxiliary losses.

        Args:
            batchs: batch dict with at least "mel" [B, D, T], "mel_lengths",
                "rand_mel" [B, D, T], "text" (list of strings or ids), "langs".
            noise_scheduler: unused placeholder (see TODO below).

        Returns:
            (total_loss, ctc_scaled, accent_loss, n_valid, cond, pred).
        """

        inp = batchs["mel"].permute(0, 2, 1)  # -> [B, T, D]
        lens = batchs["mel_lengths"]

        rand_mel = batchs["rand_mel"].permute(0, 2, 1)

        text = batchs["text"]
        target_text_lengths = torch.tensor([len(x) for x in text], device=inp.device)

        langs = batchs["langs"]

        # handle raw wave
        if inp.ndim == 2:
            inp = self.mel_spec(inp)
            inp = inp.permute(0, 2, 1)
            assert inp.shape[-1] == self.num_channels

        batch, seq_len, dtype, device, _σ1 = *inp.shape[:2], inp.dtype, self.device, self.sigma

        # handle text as string
        if isinstance(text, list):
            if exists(self.vocab_char_map):
                text = list_str_to_idx(text, self.vocab_char_map).to(device)
            else:
                text = list_str_to_tensor(text).to(device)
            assert text.shape[0] == batch

        # lens and mask
        if not exists(lens):
            lens = torch.full((batch,), seq_len, device=device)

        mask = lens_to_mask(lens, length=seq_len)  # useless here, as collate_fn will pad to max length in batch

        # get a random span to mask out for training conditionally
        frac_lengths = torch.zeros((batch,), device=self.device).float().uniform_(*self.frac_lengths_mask)
        rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)

        if exists(mask):
            rand_span_mask &= mask

        # mel is x1
        x1 = inp

        # x0 is gaussian noise
        x0 = torch.randn_like(x1)

        # time step
        time = torch.rand((batch,), dtype=dtype, device=self.device)
        # TODO. noise_scheduler

        # sample xt (φ_t(x) in the paper)
        t = time.unsqueeze(-1).unsqueeze(-1)
        φ = (1 - t) * x0 + t * x1
        flow = x1 - x0

        # condition = target mel with the random span zeroed out
        cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1)

        # NOTE(review): original comment claimed "use spk_emb with 50%
        # probability", but the code always blends with a random ratio.
        if self.use_spk_enc:

            spk_emb = self.speaker_encoder(rand_mel, lens)
            # global_emb: [batch, 1, dim] -> expand to [batch, seq_len, dim]
            spk_emb = spk_emb.unsqueeze(1).expand_as(x1)
            # apply the span mask to the speaker embedding
            cond = torch.where(rand_span_mask[..., None], torch.zeros_like(spk_emb), spk_emb)

            # blend: cond * r + spk_emb * (1 - r) with a per-sample random r
            rand_num = torch.rand((batch, 1, 1), dtype=dtype, device=self.device)
            cond = cond * rand_num + spk_emb * (1 - rand_num)

        # gradient reversal: push the condition towards accent invariance
        cond_grl = grad_reverse(cond, lambda_=1.0)

        # transformer and cfg training with a drop rate
        drop_audio_cond = random() < self.audio_drop_prob  # p_drop in voicebox paper
        if random() < self.text_drop_prob:  # p_uncond in voicebox paper
            drop_audio_cond = True
            drop_text_cond = True
        else:
            drop_text_cond = False

        # if want rigorously mask out padding, record in collate_fn in dataset.py, and pass in here
        # adding mask will use more memory, thus also need to adjust batchsampler with scaled down threshold for long sequences
        pred = self.transformer(x=φ, cond=cond_grl, text=text, time=time, drop_audio_cond=drop_audio_cond, drop_text=drop_text_cond)

        # flow matching loss (clamped for numerical stability)
        pred_clamp = pred.float().clamp(-20, 20)
        loss = F.mse_loss(pred_clamp, flow, reduction="none")
        loss = loss[rand_span_mask]  # [N]

        # replace NaNs / extreme values with a cap before averaging
        loss = torch.where(torch.isnan(loss) | (loss > 300.0), 300.0, loss)
        loss = loss.mean()

        # accent classification on the gradient-reversed condition
        accent_logits = self.accent_classifier(cond_grl)
        accent_logits_mean = accent_logits.mean(dim=1)
        lang_labels = langs.to(accent_logits.device).long()
        accent_loss = self.accent_criterion(accent_logits_mean, lang_labels)
        # guard against NaN / Inf in accent_loss
        if not torch.isfinite(accent_loss):
            accent_loss = torch.zeros_like(accent_loss, device=accent_loss.device)
        loss += 0.1 * accent_loss

        # auxiliary losses only apply to samples late in the flow (t > 0.5),
        # where pred is close enough to real mel to be meaningful
        valid_indices = torch.where(time > 0.5)[0]
        if valid_indices.size(0) > 2:
            # dynamically select the qualifying samples
            selected_gt = inp[valid_indices]
            selected_pred = pred[valid_indices]
            selected_text = text[valid_indices]
            selected_lens = lens[valid_indices]
            selected_target_lengths = target_text_lengths[valid_indices]

        if self.use_spk_enc and valid_indices.size(0) > 2:
            # speaker encoder loss
            e_gt = self.speaker_encoder(selected_gt, selected_lens)
            e_pred = self.speaker_encoder(selected_pred, selected_lens)
            spk_loss = self.info_nce_speaker(e_gt, e_pred)
            if not torch.isnan(spk_loss).any():
                loss = loss + spk_loss * 10.0
            else:
                spk_loss = torch.zeros_like(loss, device=loss.device, requires_grad=False)
        else:
            spk_loss = torch.zeros_like(loss, device=loss.device, requires_grad=False)

        # ctc loss
        if self.use_ctc_loss and valid_indices.size(0) > 2:
            # only computed when t > 0.5 (see valid_indices above)
            ctc_loss = self.ctc(
                decoder_outputs=selected_pred,
                target_phones=selected_text,
                decoder_lengths=selected_lens,
                target_lengths=selected_target_lengths,
            )
            # only add the CTC loss when it is finite and non-trivial
            if not torch.isnan(ctc_loss).any() and ctc_loss.item() > 1e-6:
                ctc_scaled = ctc_loss
                loss = loss + 0.1 * ctc_scaled
            else:
                ctc_scaled = torch.zeros_like(loss, device=loss.device, requires_grad=False)
        else:
            ctc_scaled = torch.zeros_like(loss, device=loss.device, requires_grad=False)

        total_loss = loss  # base flow loss + others added above
        # note: we intentionally do NOT add 0.0 * pred.sum() etc. here, to avoid
        # propagating NaNs from intermediate tensors into the loss scalar.

        return total_loss, ctc_scaled, accent_loss, len(valid_indices), cond, pred
701
+
702
+
703
+ def forward(self, batchs: Dict[str, torch.Tensor], *, noise_scheduler: str | None = None):
704
+ """
705
+ Simplified forward version for accent-invariant flow matching.
706
+ Removes speaker encoder and CTC parts, keeps accent GRL.
707
+ """
708
+ inp = batchs["mel"].permute(0, 2, 1) # [B, T_mel, D]
709
+ lens = batchs["mel_lengths"]
710
+ text = batchs["text"]
711
+ langs = batchs["langs"]
712
+ audio_16k_list = batchs.get("audio_16k", None)
713
+ prosody_idx_list = batchs.get("prosody_idx", None)
714
+
715
+ # # ---- 4. 随机截取并打乱 segment ----
716
+ # rand_mel = [clip_and_shuffle(spec, spec.shape[-1]) for spec in batchs["mel"]]
717
+
718
+ # padded_rand_mel = []
719
+ # for spec in rand_mel:
720
+ # padding = (0, batchs["mel"].shape[-1] - spec.size(-1))
721
+ # padded_spec = F.pad(spec, padding, value=0)
722
+ # padded_rand_mel.append(padded_spec)
723
+ # rand_mel = torch.stack(padded_rand_mel).permute(0, 2, 1)
724
+ # assert rand_mel.shape == inp.shape, f"shape diff: rand_mel.shape: {rand_mel.shape}, inp.shape: {inp.shape}"
725
+
726
+ if inp.ndim == 2:
727
+ inp = self.mel_spec(inp).permute(0, 2, 1)
728
+ assert inp.shape[-1] == self.num_channels
729
+
730
+ batch, seq_len, dtype, device = *inp.shape[:2], inp.dtype, self.device
731
+
732
+ # --- handle text
733
+ if isinstance(text, list):
734
+ if exists(self.vocab_char_map):
735
+ text = list_str_to_idx(text, self.vocab_char_map).to(device)
736
+ else:
737
+ text = list_str_to_tensor(text).to(device)
738
+ assert text.shape[0] == batch
739
+ # print("text:", batchs["text"][0], text.shape, text[0], batchs["text_lengths"][0])
740
+ # --- prosody conditioning (compute embeddings per sub-utterance)
741
+ prosody_mel_cond = None
742
+ prosody_text_cond = None
743
+ if (
744
+ self.prosody_encoder is not None
745
+ and audio_16k_list is not None
746
+ and prosody_idx_list is not None
747
+ ):
748
+ # prepare zero tensors for each sample
749
+ T_mel = seq_len
750
+ T_text = text.shape[1]
751
+ prosody_mel_cond = torch.zeros(batch, T_mel, 512, device=device, dtype=dtype)
752
+ prosody_text_cond = torch.zeros(batch, T_text, 512, device=device, dtype=dtype)
753
+
754
+ # collect all segments, run encoder per segment
755
+ seg_embeds: list[Tensor] = []
756
+ seg_meta: list[tuple[int, int, int, int, int, int]] = []
757
+ for b in range(batch):
758
+ audio_b = audio_16k_list[b]
759
+ idx_list = prosody_idx_list[b]
760
+ if audio_b is None or idx_list is None:
761
+ continue
762
+ audio_b = audio_b.to(device=device, dtype=dtype)
763
+ for seg in idx_list:
764
+ text_start, text_end, mel_start, mel_end, audio_start, audio_end = seg
765
+ # clamp audio indices
766
+ audio_start = max(0, min(audio_start, audio_b.shape[0] - 1))
767
+ audio_end = max(audio_start + 1, min(audio_end, audio_b.shape[0]))
768
+ audio_seg = audio_b[audio_start:audio_end]
769
+ if audio_seg.numel() == 0:
770
+ continue
771
+ fbank = extract_fbank_16k(audio_seg) # (T_fbank, 80)
772
+ fbank = fbank.unsqueeze(0).to(device=device, dtype=dtype) # (1, T_fbank, 80)
773
+ with torch.no_grad():
774
+ emb = self.prosody_encoder(fbank, padding_mask=None)[0] # (512,)
775
+ seg_embeds.append(emb)
776
+ seg_meta.append(
777
+ (b, text_start, text_end, mel_start, mel_end)
778
+ )
779
+
780
+ if seg_embeds:
781
+ seg_embeds_tensor = torch.stack(seg_embeds, dim=0) # (N_seg, 512)
782
+ # scatter embeddings back to per-sample tensors
783
+ for emb, meta in zip(seg_embeds_tensor, seg_meta):
784
+ b, ts, te, ms, me = meta
785
+ emb_exp = emb.to(device=device, dtype=dtype)
786
+ prosody_mel_cond[b, ms:me, :] = emb_exp
787
+ prosody_text_cond[b, ts:te, :] = emb_exp
788
+
789
+ # dropout on prosody conditioning
790
+ prosody_mel_cond = self.prosody_dropout(prosody_mel_cond)
791
+ prosody_text_cond = self.prosody_dropout(prosody_text_cond)
792
+
793
+ # --- mask & random span
794
+ mask = lens_to_mask(lens, length=seq_len)
795
+ frac_lengths = torch.zeros((batch,), device=device).float().uniform_(*self.frac_lengths_mask)
796
+ rand_span_mask = mask_from_frac_lengths(lens, frac_lengths)
797
+ if exists(mask):
798
+ rand_span_mask &= mask
799
+
800
+ # --- flow setup
801
+ x1 = inp
802
+ x0 = torch.randn_like(x1)
803
+ time = torch.rand((batch,), dtype=dtype, device=device)
804
+ t = time[:, None, None]
805
+ φ = (1 - t) * x0 + t * x1
806
+ flow = x1 - x0
807
+
808
+ # --- conditional input (masked mel) + optional prosody
809
+ cond = torch.where(rand_span_mask[..., None], torch.zeros_like(x1), x1) # x1 # rand_mel
810
+ if prosody_mel_cond is not None:
811
+ prosody_mel_proj = self.prosody_to_mel(prosody_mel_cond) # (B, T_mel, num_channels)
812
+ # if needed, pad/crop to seq_len
813
+ if prosody_mel_proj.size(1) < seq_len:
814
+ pad_len = seq_len - prosody_mel_proj.size(1)
815
+ prosody_mel_proj = F.pad(prosody_mel_proj, (0, 0, 0, pad_len))
816
+ elif prosody_mel_proj.size(1) > seq_len:
817
+ prosody_mel_proj = prosody_mel_proj[:, :seq_len, :]
818
+ cond = cond + prosody_mel_proj
819
+
820
+ # --- Gradient reversal: encourage accent-invariant cond
821
+ cond_grl = grad_reverse(cond, lambda_=1.0)
822
+
823
+ # # --- random drop condition for CFG-like robustness
824
+ # drop_audio_cond = random() < self.audio_drop_prob
825
+ # drop_text_cond = random() < self.text_drop_prob if not drop_audio_cond else True
826
+
827
+ # safe per-batch random (tensor)
828
+ rand_for_drop = torch.rand(1, device=device)
829
+ drop_audio_cond = (rand_for_drop.item() < self.audio_drop_prob)
830
+ rand_for_text = torch.rand(1, device=device)
831
+ drop_text_cond = (rand_for_text.item() < self.text_drop_prob)
832
+
833
+ # --- main prediction
834
+ pred = self.transformer(
835
+ x=φ,
836
+ cond=cond_grl,
837
+ text=text,
838
+ time=time,
839
+ drop_audio_cond=drop_audio_cond,
840
+ drop_text=drop_text_cond,
841
+ prosody_text=prosody_text_cond,
842
+ )
843
+
844
+ # === FLOW LOSS (robust mask-weighted) ===
845
+ pred_clamp = pred.float().clamp(-20, 20)
846
+ per_elem_loss = F.mse_loss(pred_clamp, flow, reduction="none") # [B, T, D]
847
+
848
+ mask_exp = rand_span_mask.unsqueeze(-1).to(dtype=per_elem_loss.dtype) # [B, T, 1]
849
+ masked_loss = per_elem_loss * mask_exp # zeros where mask False
850
+
851
+ # total selected scalar (frames * dim)
852
+ n_selected = mask_exp.sum() * per_elem_loss.size(-1) # scalar
853
+ denom = torch.clamp(n_selected, min=1.0)
854
+
855
+ loss_sum = masked_loss.sum()
856
+ loss = loss_sum / denom
857
+ # numeric safety
858
+ loss = torch.where(torch.isnan(loss) | (loss > 300.0), torch.tensor(300.0, device=loss.device, dtype=loss.dtype), loss)
859
+
860
+ # === ACCENT LOSS ===
861
+ accent_logits = self.accent_classifier(cond_grl)
862
+ # pool across time -> [B, C]
863
+ accent_logits_mean = accent_logits.mean(dim=1)
864
+ lang_labels = langs.to(accent_logits_mean.device).long()
865
+ accent_loss = self.accent_criterion(accent_logits_mean, lang_labels)
866
+ # guard against NaN / Inf in accent_loss
867
+ if not torch.isfinite(accent_loss):
868
+ accent_loss = torch.zeros_like(accent_loss, device=accent_loss.device)
869
+
870
+ base_loss = loss + 0.1 * accent_loss
871
+
872
+ # === OPTIONAL CTC LOSS (robust, only on valid samples) ===
873
+ ctc_scaled = torch.tensor(0.0, device=device, dtype=dtype)
874
+ if getattr(self, "use_ctc_loss", False) and getattr(self, "ctc", None) is not None:
875
+ # select samples with larger t for CTC supervision (similar to forward_old)
876
+ valid_indices = torch.where(time > 0.5)[0]
877
+ if valid_indices.size(0) > 2:
878
+ selected_pred = pred[valid_indices]
879
+ selected_text = text[valid_indices]
880
+ selected_lens = lens[valid_indices]
881
+ # text was tokenized from list_str_to_idx, where padding is -1
882
+ selected_target_lengths = (selected_text != -1).sum(dim=-1)
883
+
884
+ ctc_loss = self.ctc(
885
+ decoder_outputs=selected_pred,
886
+ target_phones=selected_text,
887
+ decoder_lengths=selected_lens,
888
+ target_lengths=selected_target_lengths,
889
+ )
890
+ if torch.isfinite(ctc_loss) and ctc_loss.item() > 1e-6:
891
+ ctc_scaled = ctc_loss
892
+ base_loss = base_loss + 0.1 * ctc_scaled
893
+
894
+ total_loss = base_loss
895
+
896
+ # note: we intentionally do NOT add 0.0 * pred.sum() etc. here, to avoid
897
+ # propagating NaNs from intermediate tensors into the loss scalar.
898
+
899
+ return total_loss, accent_loss, ctc_scaled, cond, pred
lemas_tts/model/modules.py ADDED
@@ -0,0 +1,802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+ import torchaudio
18
+ from librosa.filters import mel as librosa_mel_fn
19
+ from torch import nn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+ from torch.autograd import Function
22
+
23
+ # raw wav to mel spec
24
+
25
+
26
# Per-configuration caches so the mel filterbank and Hann window are built once
# per (STFT settings, device) combination and reused across calls.
mel_basis_cache = {}
hann_window_cache = {}
28
+
29
+
30
def get_bigvgan_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=None,
    center=False,
):  # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
    """Compute a log-mel spectrogram the BigVGAN way (reflect-padded, uncentered STFT)."""
    device = waveform.device
    cache_key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"

    # Build (and cache) the mel filterbank and Hann window once per configuration/device.
    if cache_key not in mel_basis_cache:
        filterbank = librosa_mel_fn(
            sr=target_sample_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=fmin, fmax=fmax
        )
        mel_basis_cache[cache_key] = torch.from_numpy(filterbank).float().to(device)
        hann_window_cache[cache_key] = torch.hann_window(win_length).to(device)

    mel_basis = mel_basis_cache[cache_key]
    window = hann_window_cache[cache_key]

    # Manual reflect padding so uncentered STFT frames line up with hop_length.
    pad = (n_fft - hop_length) // 2
    padded = torch.nn.functional.pad(waveform.unsqueeze(1), (pad, pad), mode="reflect").squeeze(1)

    stft = torch.stft(
        padded,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # Magnitude with a small epsilon under the sqrt for numerical safety.
    magnitude = torch.sqrt(torch.view_as_real(stft).pow(2).sum(-1) + 1e-9)

    # Project onto mel bands, then log-compress with a floor.
    return torch.log(torch.clamp(torch.matmul(mel_basis, magnitude), min=1e-5))
73
+
74
+
75
def get_vocos_mel_spectrogram(
    waveform,
    n_fft=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
    hop_length=256,
    win_length=1024,
):
    """Compute a log-mel spectrogram with torchaudio, matching the Vocos frontend."""
    transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        n_mels=n_mel_channels,
        power=1,
        center=True,
        normalized=False,
        norm=None,
    ).to(waveform.device)

    # Accept (b, 1, nw) input and drop the channel axis down to (b, nw).
    if len(waveform.shape) == 3:
        waveform = waveform.squeeze(1)

    assert len(waveform.shape) == 2

    # Magnitude mel spectrogram, floored then log-compressed.
    return transform(waveform).clamp(min=1e-5).log()
102
+
103
+
104
class MelSpec(nn.Module):
    """Extract log-mel spectrograms with either the vocos or bigvgan frontend.

    The non-persistent `dummy` buffer tracks which device this module lives on
    so the extractor can be moved lazily to the input's device in `forward`.
    """

    def __init__(
        self,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        target_sample_rate=24_000,
        mel_spec_type="vocos",
    ):
        super().__init__()
        # Fix: was `assert ..., print(...)` — print() executes unconditionally and
        # returns None, so the assertion message was always None. Use a plain string.
        assert mel_spec_type in ["vocos", "bigvgan"], (
            "We only support two extract mel backend: vocos or bigvgan"
        )

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.n_mel_channels = n_mel_channels
        self.target_sample_rate = target_sample_rate

        if mel_spec_type == "vocos":
            self.extractor = get_vocos_mel_spectrogram
        elif mel_spec_type == "bigvgan":
            self.extractor = get_bigvgan_mel_spectrogram

        self.register_buffer("dummy", torch.tensor(0), persistent=False)

    def forward(self, wav):
        """Return the mel spectrogram of `wav` (batched raw waveform)."""
        # Follow the input's device (e.g. first call after moving data to GPU).
        if self.dummy.device != wav.device:
            self.to(wav.device)

        mel = self.extractor(
            waveform=wav,
            n_fft=self.n_fft,
            n_mel_channels=self.n_mel_channels,
            target_sample_rate=self.target_sample_rate,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )

        return mel
144
+
145
+
146
+ # sinusoidal position embedding
147
+
148
+
149
class SinusPositionEmbedding(nn.Module):
    """Classic sinusoidal embedding: first half sin, second half cos."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        # Geometric frequency ladder spanning half the embedding dimension.
        half = self.dim // 2
        log_step = math.log(10000) / (half - 1)
        inv_freq = torch.exp(torch.arange(half, device=x.device).float() * -log_step)
        angles = scale * x.unsqueeze(1) * inv_freq.unsqueeze(0)
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
162
+
163
+
164
+ # convolutional position embedding
165
+
166
+
167
class ConvPositionEmbedding(nn.Module):
    """Grouped Conv1d stack used as a convolutional positional embedding."""

    def __init__(self, dim, kernel_size=31, groups=16):
        super().__init__()
        assert kernel_size % 2 != 0  # odd kernel keeps length with symmetric padding
        pad = kernel_size // 2
        self.conv1d = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=pad),
            nn.Mish(),
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=pad),
            nn.Mish(),
        )

    def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):  # noqa: F722
        # Zero padded positions before and after the convolution so padding
        # never leaks into (or out of) real frames.
        if mask is not None:
            mask = mask[..., None]
            x = x.masked_fill(~mask, 0.0)

        out = self.conv1d(x.permute(0, 2, 1)).permute(0, 2, 1)

        if mask is not None:
            out = out.masked_fill(~mask, 0.0)

        return out
191
+
192
+
193
+ # rotary positional embedding related
194
+
195
+
196
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0):
    """Precompute rotary-embedding angles, returned as [cos | sin] of shape (end, dim).

    `theta_rescale_factor` follows the NTK-aware rescaling trick (reddit user bloc97)
    so rotary embeddings extend to longer sequences without fine-tuning:
    https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
    https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
    """
    theta *= theta_rescale_factor ** (dim / (dim - 2))
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
    positions = torch.arange(end, device=inv_freq.device)
    angles = torch.outer(positions, inv_freq).float()
    # Real (cos) and imaginary (sin) parts concatenated along the last axis.
    return torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
208
+
209
+
210
def get_pos_embed_indices(start, length, max_pos, scale=1.0):
    """Absolute position indices from `start`, stretched by `scale`, clipped to max_pos-1."""
    scale = scale * torch.ones_like(start, dtype=torch.float32)  # broadcast scalar scale per batch
    offsets = (
        torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0) * scale.unsqueeze(1)
    ).long()
    pos = start.unsqueeze(1) + offsets
    # Clamp so extra-long sequences never index past the precomputed table.
    return torch.where(pos < max_pos, pos, max_pos - 1)
220
+
221
+
222
+ # Global Response Normalization layer (Instance Normalization ?)
223
+
224
+
225
class GRN(nn.Module):
    """Global Response Normalization (ConvNeXt-V2); identity at initialisation."""

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
        self.beta = nn.Parameter(torch.zeros(1, 1, dim))

    def forward(self, x):
        # L2 energy per channel over the sequence axis, normalised by its channel mean.
        global_norm = torch.norm(x, p=2, dim=1, keepdim=True)
        calibrated = global_norm / (global_norm.mean(dim=-1, keepdim=True) + 1e-6)
        return self.gamma * (x * calibrated) + self.beta + x
235
+
236
+
237
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
238
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
239
+
240
+
241
class ConvNeXtV2Block(nn.Module):
    """ConvNeXt-V2 residual block: depthwise conv -> LN -> MLP with GRN.

    https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
    ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        dilation: int = 1,
    ):
        super().__init__()
        padding = (dilation * (7 - 1)) // 2  # keeps sequence length for kernel size 7
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, intermediate_dim)  # pointwise/1x1 conv as linear
        self.act = nn.GELU()
        self.grn = GRN(intermediate_dim)
        self.pwconv2 = nn.Linear(intermediate_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        # Depthwise conv operates over time, so swap to (b, d, n) and back.
        h = self.dwconv(x.transpose(1, 2)).transpose(1, 2)
        h = self.pwconv2(self.grn(self.act(self.pwconv1(self.norm(h)))))
        return residual + h
270
+
271
+
272
+ # RMSNorm
273
+
274
+
275
class RMSNorm(nn.Module):
    """Root-mean-square LayerNorm with a learned per-channel scale.

    Uses torch's fused `F.rms_norm` when available (torch >= 2.4), otherwise a
    manual float32 implementation; both paths compute the same result.
    """

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
        # Fix: the old check `float(torch.__version__[:3]) >= 2.4` breaks on
        # versions like "2.10.0" (truncates to "2.1"). Compare (major, minor)
        # as an integer tuple instead; fall back to the manual path on any
        # unexpected version string.
        try:
            major, minor = torch.__version__.split(".")[:2]
            self.native_rms_norm = (int(major), int(minor.split("+")[0])) >= (2, 4)
        except ValueError:
            self.native_rms_norm = False

    def forward(self, x):
        if self.native_rms_norm:
            # Fused kernel wants input in the weight's (possibly half) dtype.
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                x = x.to(self.weight.dtype)
            x = F.rms_norm(x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps)
        else:
            # Compute variance in float32 for numerical stability, then rescale.
            variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
            x = x * torch.rsqrt(variance + self.eps)
            if self.weight.dtype in [torch.float16, torch.bfloat16]:
                x = x.to(self.weight.dtype)
            x = x * self.weight

        return x
295
+
296
+
297
+ # AdaLayerNorm
298
+ # return with modulated x for attn input, and params for later mlp modulation
299
+
300
+
301
class AdaLayerNorm(nn.Module):
    """Adaptive LayerNorm conditioned on an embedding.

    Returns the modulated x for the attention input plus the gate/shift/scale
    parameters used later for MLP modulation.
    """

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 6)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb=None):
        # One projection yields all six modulation parameter sets.
        params = self.linear(self.silu(emb))
        (
            shift_msa,
            scale_msa,
            gate_msa,
            shift_mlp,
            scale_mlp,
            gate_mlp,
        ) = params.chunk(6, dim=1)

        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
316
+
317
+
318
+ # AdaLayerNorm for final layer
319
+ # return only with modulated x for attn input, cuz no more mlp modulation
320
+
321
+
322
class AdaLayerNorm_Final(nn.Module):
    """Adaptive LayerNorm for the final layer: scale/shift only, no MLP gates."""

    def __init__(self, dim):
        super().__init__()

        self.silu = nn.SiLU()
        self.linear = nn.Linear(dim, dim * 2)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

    def forward(self, x, emb):
        scale, shift = self.linear(self.silu(emb)).chunk(2, dim=1)
        return self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
337
+
338
+
339
+ # FeedForward
340
+
341
+
342
class FeedForward(nn.Module):
    """Transformer MLP: Linear -> GELU -> Dropout -> Linear."""

    def __init__(self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"):
        super().__init__()
        inner_dim = int(dim * mult)
        out_features = dim if dim_out is None else dim_out

        activation = nn.GELU(approximate=approximate)
        project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
        self.ff = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(inner_dim, out_features))

    def forward(self, x):
        return self.ff(x)
354
+
355
+
356
+ # Attention with possible joint part
357
+ # modified from diffusers/src/diffusers/models/attention_processor.py
358
+
359
+
360
class Attention(nn.Module):
    """Multi-head attention, optionally joint with a context stream (MM-DiT style).

    If `context_dim` is set, extra q/k/v projections for the context are created
    and the processor is expected to run joint attention; `context_pre_only`
    drops the context output projection (final layer has no context FFN).
    """

    def __init__(
        self,
        processor: JointAttnProcessor | AttnProcessor,
        dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        context_dim: Optional[int] = None,  # if not None -> joint attention
        context_pre_only: bool = False,
        qk_norm: Optional[str] = None,
    ):
        super().__init__()

        if not hasattr(F, "scaled_dot_product_attention"):
            # fix: error message had a typo ("equires" -> "requires")
            raise ImportError("Attention requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

        self.processor = processor

        self.dim = dim
        self.heads = heads
        self.inner_dim = dim_head * heads
        self.dropout = dropout

        self.context_dim = context_dim
        self.context_pre_only = context_pre_only

        # `sample`-stream projections
        self.to_q = nn.Linear(dim, self.inner_dim)
        self.to_k = nn.Linear(dim, self.inner_dim)
        self.to_v = nn.Linear(dim, self.inner_dim)

        if qk_norm is None:
            self.q_norm = None
            self.k_norm = None
        elif qk_norm == "rms_norm":
            self.q_norm = RMSNorm(dim_head, eps=1e-6)
            self.k_norm = RMSNorm(dim_head, eps=1e-6)
        else:
            raise ValueError(f"Unimplemented qk_norm: {qk_norm}")

        # context-stream projections (joint attention only)
        if self.context_dim is not None:
            self.to_q_c = nn.Linear(context_dim, self.inner_dim)
            self.to_k_c = nn.Linear(context_dim, self.inner_dim)
            self.to_v_c = nn.Linear(context_dim, self.inner_dim)
            if qk_norm is None:
                self.c_q_norm = None
                self.c_k_norm = None
            elif qk_norm == "rms_norm":
                self.c_q_norm = RMSNorm(dim_head, eps=1e-6)
                self.c_k_norm = RMSNorm(dim_head, eps=1e-6)

        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(self.inner_dim, dim))
        self.to_out.append(nn.Dropout(dropout))

        if self.context_dim is not None and not self.context_pre_only:
            self.to_out_c = nn.Linear(self.inner_dim, context_dim)

    def forward(
        self,
        x: float["b n d"],  # noised input x  # noqa: F722
        c: float["b n d"] = None,  # context c  # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.Tensor:
        # Dispatch to the configured processor; joint path only when c is given.
        if c is not None:
            return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
        else:
            return self.processor(self, x, mask=mask, rope=rope)
430
+
431
+
432
+ # Attention processor
433
+
434
+
435
class AttnProcessor:
    """Plain self-attention processor for `Attention` (no context stream)."""

    def __init__(
        self,
        pe_attn_head: int | None = None,  # number of attention head to apply rope, None for all
    ):
        self.pe_attn_head = pe_attn_head

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding
    ) -> torch.FloatTensor:
        batch_size = x.shape[0]

        # `sample` projections
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # attention: reshape to (b, heads, n, head_dim) for scaled_dot_product_attention
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # qk norm (per-head RMSNorm when configured on `attn`)
        if attn.q_norm is not None:
            query = attn.q_norm(query)
        if attn.k_norm is not None:
            key = attn.k_norm(key)

        # apply rotary position embedding
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)

            if self.pe_attn_head is not None:
                # rope applied in place to only the first `pe_attn_head` heads
                # (heads are dim 1 after the transpose above)
                pn = self.pe_attn_head
                query[:, :pn, :, :] = apply_rotary_pos_emb(query[:, :pn, :, :], freqs, q_xpos_scale)
                key[:, :pn, :, :] = apply_rotary_pos_emb(key[:, :pn, :, :], freqs, k_xpos_scale)
            else:
                query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
                key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        if mask is not None:
            attn_mask = mask
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        # merge heads back: (b, heads, n, head_dim) -> (b, n, heads * head_dim)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)

        if mask is not None:
            # re-zero padded positions after the output projection
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)

        return x
504
+
505
+
506
+ # Joint Attention processor for MM-DiT
507
+ # modified from diffusers/src/diffusers/models/attention_processor.py
508
+
509
+
510
class JointAttnProcessor:
    """Joint attention over the noised-input stream x and the context stream c.

    Both streams are projected separately, concatenated along the sequence
    axis (x tokens first), attended jointly, then split back apart.
    """

    def __init__(self):
        pass

    def __call__(
        self,
        attn: Attention,
        x: float["b n d"],  # noised input x # noqa: F722
        c: float["b nt d"] = None,  # context c, here text # noqa: F722
        mask: bool["b n"] | None = None,  # noqa: F722
        rope=None,  # rotary position embedding for x
        c_rope=None,  # rotary position embedding for c
    ) -> torch.FloatTensor:
        # kept only to know where to split the x part from the c part afterwards
        residual = x

        batch_size = c.shape[0]

        # `sample` projections
        query = attn.to_q(x)
        key = attn.to_k(x)
        value = attn.to_v(x)

        # `context` projections
        c_query = attn.to_q_c(c)
        c_key = attn.to_k_c(c)
        c_value = attn.to_v_c(c)

        # attention: reshape all six to (b, heads, n, head_dim)
        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        c_query = c_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        c_key = c_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        c_value = c_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # qk norm
        if attn.q_norm is not None:
            query = attn.q_norm(query)
        if attn.k_norm is not None:
            key = attn.k_norm(key)
        if attn.c_q_norm is not None:
            c_query = attn.c_q_norm(c_query)
        if attn.c_k_norm is not None:
            c_key = attn.c_k_norm(c_key)

        # apply rope for context and noised input independently
        if rope is not None:
            freqs, xpos_scale = rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
            key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
        if c_rope is not None:
            freqs, xpos_scale = c_rope
            q_xpos_scale, k_xpos_scale = (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
            c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
            c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)

        # joint attention: x tokens first, context tokens appended after (dim 2 = sequence)
        query = torch.cat([query, c_query], dim=2)
        key = torch.cat([key, c_key], dim=2)
        value = torch.cat([value, c_value], dim=2)

        # mask. e.g. inference got a batch with different target durations, mask out the padding
        if mask is not None:
            attn_mask = F.pad(mask, (0, c.shape[1]), value=True)  # no mask for c (text)
            attn_mask = attn_mask.unsqueeze(1).unsqueeze(1)  # 'b n -> b 1 1 n'
            attn_mask = attn_mask.expand(batch_size, attn.heads, query.shape[-2], key.shape[-2])
        else:
            attn_mask = None

        x = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False)
        x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        x = x.to(query.dtype)

        # Split the attention outputs (x part has residual.shape[1] tokens).
        x, c = (
            x[:, : residual.shape[1]],
            x[:, residual.shape[1] :],
        )

        # linear proj
        x = attn.to_out[0](x)
        # dropout
        x = attn.to_out[1](x)
        if not attn.context_pre_only:
            c = attn.to_out_c(c)

        if mask is not None:
            mask = mask.unsqueeze(-1)
            x = x.masked_fill(~mask, 0.0)
            # c = c.masked_fill(~mask, 0.)  # no mask for c (text)

        return x, c
605
+
606
+
607
+ # DiT Block
608
+
609
+
610
class DiTBlock(nn.Module):
    """DiT transformer block: AdaLN-modulated self-attention plus a gated MLP."""

    def __init__(self, dim, heads, dim_head, ff_mult=4, dropout=0.1, qk_norm=None, pe_attn_head=None):
        super().__init__()

        self.attn_norm = AdaLayerNorm(dim)
        self.attn = Attention(
            processor=AttnProcessor(pe_attn_head=pe_attn_head),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            qk_norm=qk_norm,
        )

        self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, t, mask=None, rope=None):
        """x: noised input, t: time embedding."""
        # AdaLN produces the attention input plus the gates/shifts for the MLP.
        modulated, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)

        # Gated residual around attention.
        x = x + gate_msa.unsqueeze(1) * self.attn(x=modulated, mask=mask, rope=rope)

        # AdaLN-style modulation of the MLP input, then another gated residual.
        mlp_in = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
        x = x + gate_mlp.unsqueeze(1) * self.ff(mlp_in)

        return x
642
+
643
+
644
+ # MMDiT Block https://arxiv.org/abs/2403.03206
645
+
646
+
647
class MMDiTBlock(nn.Module):
    r"""
    MMDiT block (https://arxiv.org/abs/2403.03206),
    modified from diffusers/src/diffusers/models/attention.py

    notes.
    _c: context related. text, cond, etc. (left part in sd3 fig2.b)
    _x: noised input related. (right part)
    context_pre_only: last layer only do prenorm + modulation cuz no more ffn
    """

    def __init__(
        self, dim, heads, dim_head, ff_mult=4, dropout=0.1, context_dim=None, context_pre_only=False, qk_norm=None
    ):
        super().__init__()
        if context_dim is None:
            context_dim = dim
        self.context_pre_only = context_pre_only

        # Final-layer context stream only needs scale/shift (no MLP gates).
        self.attn_norm_c = AdaLayerNorm_Final(context_dim) if context_pre_only else AdaLayerNorm(context_dim)
        self.attn_norm_x = AdaLayerNorm(dim)
        self.attn = Attention(
            processor=JointAttnProcessor(),
            dim=dim,
            heads=heads,
            dim_head=dim_head,
            dropout=dropout,
            context_dim=context_dim,
            context_pre_only=context_pre_only,
            qk_norm=qk_norm,
        )

        if not context_pre_only:
            self.ff_norm_c = nn.LayerNorm(context_dim, elementwise_affine=False, eps=1e-6)
            self.ff_c = FeedForward(dim=context_dim, mult=ff_mult, dropout=dropout, approximate="tanh")
        else:
            self.ff_norm_c = None
            self.ff_c = None
        self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_x = FeedForward(dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh")

    def forward(self, x, c, t, mask=None, rope=None, c_rope=None):  # x: noised input, c: context, t: time embedding
        # pre-norm & modulation for attention input
        if self.context_pre_only:
            norm_c = self.attn_norm_c(c, t)
        else:
            norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(c, emb=t)
        norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(x, emb=t)

        # attention (joint over both streams)
        x_attn_output, c_attn_output = self.attn(x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope)

        # process attention output for context c
        if self.context_pre_only:
            c = None  # last layer: the context stream ends here
        else:  # if not last layer
            c = c + c_gate_msa.unsqueeze(1) * c_attn_output

            norm_c = self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
            c_ff_output = self.ff_c(norm_c)
            c = c + c_gate_mlp.unsqueeze(1) * c_ff_output

        # process attention output for input x
        x = x + x_gate_msa.unsqueeze(1) * x_attn_output

        norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
        x_ff_output = self.ff_x(norm_x)
        x = x + x_gate_mlp.unsqueeze(1) * x_ff_output

        return c, x
716
+
717
+
718
+ # time step conditioning embedding
719
+
720
+
721
class TimestepEmbedding(nn.Module):
    """Embed scalar diffusion timesteps: sinusoidal features -> 2-layer MLP."""

    def __init__(self, dim, freq_embed_dim=256):
        super().__init__()
        self.time_embed = SinusPositionEmbedding(freq_embed_dim)
        self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))

    def forward(self, timestep: float["b"]):  # noqa: F821
        # Sinusoidal features come out float32; match the timestep dtype before the MLP.
        features = self.time_embed(timestep).to(timestep.dtype)
        return self.time_mlp(features)  # b d
732
+
733
+
734
class MIEsitmator(nn.Module):
    """CTC-based estimator head over decoder outputs.

    (Class name typo "MIEsitmator" is kept as-is for checkpoint/caller
    compatibility.)
    """

    def __init__(self, vocab_size, decoder_dim, hidden_size, dropout=0.5):
        super(MIEsitmator, self).__init__()
        self.proj = nn.Sequential(
            torch.nn.Linear(decoder_dim, hidden_size, bias=True),
            nn.ReLU(),
            nn.Dropout(p=dropout),
        )
        # +1 output for the CTC blank symbol (blank index == vocab_size).
        self.ctc_proj = torch.nn.Linear(hidden_size, vocab_size + 1, bias=True)
        self.ctc = nn.CTCLoss(blank=vocab_size, reduction='mean', zero_infinity=True)

    def forward(self, decoder_outputs, target_phones, decoder_lengths, target_lengths):
        """Return a scalar, length-normalised and clamped CTC loss."""
        out = self.proj(decoder_outputs.type(self.ctc_proj.weight.dtype))
        log_probs = self.ctc_proj(out).log_softmax(dim=2)
        # nn.CTCLoss expects (T, B, C)
        log_probs = log_probs.transpose(1, 0)
        ctc_loss = self.ctc(log_probs.float(), target_phones, decoder_lengths, target_lengths)
        # Normalise by decoder frame count so it matches the frame-averaged losses.
        ctc_loss = ctc_loss / decoder_lengths.float()
        # Clamp NaN / exploding values so one bad utterance cannot derail training.
        ctc_loss = torch.where((ctc_loss > 300.0) | torch.isnan(ctc_loss), 300.0, ctc_loss)
        return ctc_loss.mean()

    def inference(self, decoder_output):
        """Return (T, B, vocab+1) CTC log-probabilities.

        Fix: the original ended with `log_probs.item()`, which raises
        RuntimeError for any multi-element tensor (i.e. every real sequence);
        return the log-probability tensor instead.
        """
        out = self.proj(decoder_output.type(self.ctc_proj.weight.dtype))
        log_probs = self.ctc_proj(out).log_softmax(dim=2)
        return log_probs.transpose(1, 0)
774
+
775
+
776
class AccentClassifier(nn.Module):
    """Small MLP head that maps frame features to accent logits."""

    def __init__(self, input_dim, hidden_dim, num_accents, dropout=0.3):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_accents),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Logits per position; pooling (if any) is the caller's job.
        return self.net(x)
788
+
789
+
790
class GradientReversalFunction(Function):
    """Identity in the forward pass; flips and scales gradients in backward."""

    @staticmethod
    def forward(ctx, x, lambda_):
        ctx.lambda_ = lambda_
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Negate (and scale) the incoming gradient; no gradient w.r.t. lambda_.
        return -ctx.lambda_ * grad_output, None


def grad_reverse(x, lambda_=1.0):
    """Convenience wrapper around GradientReversalFunction (for GRL training)."""
    return GradientReversalFunction.apply(x, lambda_)
802
+
lemas_tts/model/utils.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import random
5
+ from collections import defaultdict
6
+ from importlib.resources import files
7
+
8
+ import torch
9
+ from torch.nn.utils.rnn import pad_sequence
10
+
11
+ import jieba
12
+ from pypinyin import lazy_pinyin, Style
13
+ import sys
14
+
15
+ # seed everything
16
+
17
+
18
def seed_everything(seed=0):
    """Seed Python, hashing, and all torch RNGs for reproducible runs."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade kernel speed for determinism in cuDNN.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
26
+
27
+
28
+ # helpers
29
+
30
+
31
def exists(v):
    """Return True when ``v`` is not None."""
    return v is not None


def default(v, d):
    """Return ``v`` unless it is None, otherwise the fallback ``d``."""
    return d if v is None else v
37
+
38
+
39
+ # tensor helpers
40
+
41
+
42
def lens_to_mask(t: int["b"], length: int | None = None) -> bool["b n"]:  # noqa: F722 F821
    """Build a boolean padding mask from per-item lengths.

    Args:
        t: 1-D tensor of lengths, shape (batch,).
        length: maximum sequence length; defaults to ``t.amax()``.

    Returns:
        Bool tensor of shape (batch, length); True inside each sequence.
    """
    if length is None:
        length = t.amax()

    positions = torch.arange(length, device=t.device)
    return positions.unsqueeze(0) < t.unsqueeze(1)
48
+
49
+
50
def mask_from_start_end_indices(seq_len: int["b"], start: int["b"], end: int["b"]):  # noqa: F722 F821
    """Bool mask, True where ``start[i] <= pos < end[i]``, padded to max(seq_len)."""
    max_seq_len = seq_len.max().item()
    positions = torch.arange(max_seq_len, device=start.device).long()
    after_start = positions.unsqueeze(0) >= start.unsqueeze(1)
    before_end = positions.unsqueeze(0) < end.unsqueeze(1)
    return after_start & before_end


def mask_from_frac_lengths(seq_len: int["b"], frac_lengths: float["b"]):  # noqa: F722 F821
    """Mask a random contiguous span covering ``frac_lengths`` of each sequence."""
    lengths = (frac_lengths * seq_len).long()
    max_start = seq_len - lengths

    # Pick a uniformly random start so the span fits inside the sequence.
    rand = torch.rand_like(frac_lengths)
    start = (max_start * rand).long().clamp(min=0)
    end = start + lengths

    return mask_from_start_end_indices(seq_len, start, end)
67
+
68
+
69
def maybe_masked_mean(t: float["b n d"], mask: bool["b n"] = None) -> float["b d"]:  # noqa: F722
    """Average ``t`` over dim 1, excluding positions where ``mask`` is False.

    Falls back to a plain mean when no mask is supplied. The denominator is
    clamped to 1 so fully-masked rows yield zeros instead of NaN.
    """
    if mask is None:
        return t.mean(dim=1)

    masked_t = t.masked_fill(~mask.unsqueeze(-1), 0.0)
    totals = masked_t.sum(dim=1)
    counts = mask.float().sum(dim=1).clamp(min=1.0)
    return totals / counts
78
+
79
+
80
+ # simple utf-8 tokenizer, since paper went character based
81
def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
    """Encode each string as UTF-8 byte ids (ByT5 style) and pad into a batch."""
    byte_tensors = [torch.tensor(list(s.encode("UTF-8"))) for s in text]
    return pad_sequence(byte_tensors, padding_value=padding_value, batch_first=True)


# char tokenizer, based on custom dataset's extracted .txt file
def list_str_to_idx(
    text: list[str] | list[list[str]],
    vocab_char_map: dict[str, int],  # {char: idx}
    padding_value=-1,
) -> int["b nt"]:  # noqa: F722
    """Map characters (or pinyin tokens) to vocab indices, 0 for unknown, and pad."""
    idx_tensors = [
        torch.tensor([vocab_char_map.get(token, 0) for token in entry]) for entry in text
    ]
    return pad_sequence(idx_tensors, padding_value=padding_value, batch_first=True)
95
+
96
+
97
+ # Get tokenizer
98
def _read_vocab(vocab_path):
    """Load a one-token-per-line vocab file into a {token: index} dict."""
    vocab_char_map = {}
    with open(vocab_path, "r", encoding="utf-8") as f:
        for i, char in enumerate(f):
            # Strip only the trailing newline: tokens may legitimately be " ".
            vocab_char_map[char[:-1]] = i
    return vocab_char_map


def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
    """
    tokenizer - "pinyin" do g2p for only chinese characters, need .txt vocab_file
              - "char" for char-wise tokenizer, need .txt vocab_file
              - "byte" for utf-8 tokenizer
              - "custom" if you're directly passing in a path to the vocab.txt you want to use
    vocab_size - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
               - if use "char", derived from unfiltered character & symbol counts of custom dataset
               - if use "byte", set to 256 (unicode byte range)

    Raises:
        ValueError: for an unsupported ``tokenizer`` mode (previously this
            fell through and crashed with UnboundLocalError at the return).
    """
    if tokenizer in ["pinyin", "char"]:
        tokenizer_path = os.path.join(files("lemas_tts").joinpath("../../data"), f"{dataset_name}_{tokenizer}/vocab.txt")
        vocab_char_map = _read_vocab(tokenizer_path)
        vocab_size = len(vocab_char_map)
        assert vocab_char_map[" "] == 0, "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"

    elif tokenizer == "byte":
        vocab_char_map = None
        vocab_size = 256

    elif tokenizer == "custom":
        # Here ``dataset_name`` is interpreted directly as a vocab file path.
        vocab_char_map = _read_vocab(dataset_name)
        vocab_size = len(vocab_char_map)

    else:
        raise ValueError(f"unknown tokenizer type: {tokenizer!r}")

    return vocab_char_map, vocab_size
129
+
130
+
131
# convert char to pinyin
def convert_char_to_pinyin(text_list, polyphone=True):
    """Convert each string to a token list: latin text kept as characters,
    Chinese characters replaced by TONE3 pinyin syllables (space-prefixed).

    Args:
        text_list: list of raw input strings.
        polyphone: when True, pure-CJK segments go through context-aware
            pinyin with tone sandhi.

    Returns:
        List of per-string token lists (characters and pinyin syllables).
    """
    # jieba initializes lazily; silence its logger before first use.
    if jieba.dt.initialized is False:
        jieba.default_logger.setLevel(50)  # CRITICAL
        jieba.initialize()

    final_text_list = []
    # Map full-width punctuation/quotes to ASCII equivalents to avoid OOV tokens.
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
    )  # add custom trans here, to address oov

    def is_chinese(c):
        # NOTE(review): U+3100 onward also covers bopomofo/CJK symbols, not
        # only hanzi — presumably intentional for this frontend; verify.
        return (
            "\u3100" <= c <= "\u9fff"  # common chinese characters
        )

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        # NOTE(review): this commit ships ``lemas_tts/infer/text_norm/cn_tn.py``;
        # confirm ``lemas_tts.infer.cn_tn`` is a valid import path and not a
        # stale module reference.
        from lemas_tts.infer.cn_tn import NSWNormalizer
        text = NSWNormalizer(text.strip()).normalize()
        text = list(jieba.cut(text))
        for seg in text:
            # UTF-8 byte length distinguishes ASCII (1 byte/char) from
            # pure-CJK segments (3 bytes/char) and mixed content.
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                # Insert a separating space between adjacent latin words.
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        # Other non-latin characters pass through unchanged.
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list
177
+
178
+
179
+ # filter func for dirty data with many repetitions
180
+
181
+
182
def repetition_found(text, length=2, tolerance=10):
    """Return True if any length-``length`` substring occurs more than ``tolerance`` times.

    Used to filter dirty transcripts that contain pathological repetitions.
    """
    counts = defaultdict(int)
    for start in range(len(text) - length + 1):
        pattern = text[start : start + length]
        counts[pattern] += 1
        # Stop early the moment a pattern crosses the tolerance threshold.
        if counts[pattern] > tolerance:
            return True
    return False
lemas_tts/scripts/inference_gradio.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import os
3
+ import platform
4
+ import psutil
5
+ import tempfile
6
+ from glob import glob
7
+ import traceback
8
+ import click
9
+ import gradio as gr
10
+ import torch
11
+
12
+ import sys
13
+ from pathlib import Path
14
+
15
+ # Add the local code directory so that `lemas_tts` can be imported when running this
16
+ # script directly without installing the package.
17
+ THIS_FILE = Path(__file__).resolve()
18
+ SRC_ROOT = THIS_FILE.parents[2] # .../code
19
+ sys.path.append(str(SRC_ROOT))
20
+
21
+
22
+ def _find_repo_root(start: Path) -> Path:
23
+ """Locate the repo root by looking for a `pretrained_models` folder upwards."""
24
+ for p in [start, *start.parents]:
25
+ if (p / "pretrained_models").is_dir():
26
+ return p
27
+ cwd = Path.cwd()
28
+ if (cwd / "pretrained_models").is_dir():
29
+ return cwd
30
+ return start
31
+
32
+
33
+ REPO_ROOT = _find_repo_root(THIS_FILE)
34
+ PRETRAINED_ROOT = REPO_ROOT / "pretrained_models"
35
+ CKPTS_ROOT = PRETRAINED_ROOT / "ckpts"
36
+ DATA_ROOT = PRETRAINED_ROOT / "data"
37
+ UVR5_CODE_DIR = REPO_ROOT / "code" / "uvr5"
38
+ UVR5_MODEL_DIR = PRETRAINED_ROOT / "uvr5" / "models" / "MDX_Net_Models" / "model_data"
39
+
40
+ from lemas_tts.api import F5TTS
41
+ import torch, torchaudio
42
+ import soundfile as sf
43
+
44
+ # Global variables
45
+ tts_api = None
46
+ last_checkpoint = ""
47
+ last_device = ""
48
+ last_ema = None
49
+
50
+ # Device detection
51
+ device = (
52
+ "cuda"
53
+ if torch.cuda.is_available()
54
+ else "xpu"
55
+ if torch.xpu.is_available()
56
+ else "mps"
57
+ if torch.backends.mps.is_available()
58
+ else "cpu"
59
+ )
60
+
61
+
62
class UVR5:
    """Thin wrapper around the bundled MDX-Net (Kim_Vocal_1) vocal separator.

    Loads the ONNX model on CPU at construction time and exposes a single
    ``denoise`` entry point used by the Gradio UI.
    """

    def __init__(self, model_dir):
        # The uvr5 inference code lives outside this package; its directory is
        # appended to sys.path inside load_model.
        code_dir = str(UVR5_CODE_DIR)
        self.model = self.load_model(str(model_dir), code_dir)

    def load_model(self, model_dir, code_dir):
        """Build the MDX-Net inference object from the on-disk ONNX + JSON config."""
        import sys, json, os
        # Make `multiprocess_cuda_infer` importable from the uvr5 code folder.
        sys.path.append(code_dir)
        from multiprocess_cuda_infer import ModelData, Inference
        model_path = os.path.join(model_dir, 'Kim_Vocal_1.onnx')
        config_path = os.path.join(model_dir, 'MDX-Net-Kim-Vocal1.json')
        configs = json.loads(open(config_path, 'r', encoding='utf-8').read())
        model_data = ModelData(
            model_path=model_path,
            audio_path = model_dir,
            result_path = model_dir,
            device = 'cpu',
            process_method = "MDX-Net",
            base_dir=code_dir,
            **configs
        )

        uvr5_model = Inference(model_data, 'cpu')
        uvr5_model.load_model(model_path, 1)
        return uvr5_model

    def denoise(self, audio_info):
        """Separate vocals from ``audio_info`` (a file path).

        Returns:
            Tuple of (samples as a (frames, channels) numpy array, 44100).
        """
        print("denoise UVR5: ", audio_info)
        # MDX-Net operates on 44.1 kHz stereo input.
        input_audio = load_wav(audio_info, sr=44100, channel=2)
        output_audio = self.model.demix_base({0:input_audio.squeeze()}, is_match_mix=False)
        # transform = torchaudio.transforms.Resample(44100, 16000)
        # output_audio = transform(output_audio)
        return output_audio.squeeze().T.numpy(), 44100
95
+
96
+
97
+ denoise_model = UVR5(UVR5_MODEL_DIR)
98
+
99
def load_wav(audio_info, sr=16000, channel=1):
    """Load an audio file, peak-normalize, fix channel count, and resample.

    Args:
        audio_info: path (or file-like object) accepted by ``torchaudio.load``.
        sr: target sample rate.
        channel: 1 for mono output, 2 for stereo.

    Returns:
        Float tensor clipped to [-0.999, 0.999] at the requested rate/channels.
    """
    print("load audio:", audio_info)
    audio, raw_sr = torchaudio.load(audio_info)
    # NOTE(review): torchaudio.load returns (channels, frames); transposing
    # when dim 1 == 2 looks aimed at (frames, 2) inputs — confirm intent.
    audio = audio.T if len(audio.shape) > 1 and audio.shape[1] == 2 else audio
    # Peak-normalize. Guard against all-zero (silent) input, which previously
    # divided by zero and filled the tensor with NaNs.
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = audio / peak
    audio = audio.squeeze().float()
    if channel == 1 and len(audio.shape) == 2:  # stereo to mono
        audio = audio.mean(dim=0, keepdim=True)
    elif channel == 2 and len(audio.shape) == 1:
        audio = torch.stack((audio, audio))  # mono to stereo
    if raw_sr != sr:
        audio = torchaudio.functional.resample(audio.squeeze(), raw_sr, sr)
    audio = torch.clip(audio, -0.999, 0.999).squeeze()
    return audio
113
+
114
+
115
def denoise(audio_info):
    """Run the global UVR5 separator on ``audio_info`` and return the output path."""
    output_path = "./denoised_audio.wav"
    samples, sample_rate = denoise_model.denoise(audio_info)
    # Persist as 24-bit PCM so the Gradio Audio component can serve it.
    sf.write(output_path, samples, sample_rate, format='wav', subtype='PCM_24')
    print("save denoised audio:", output_path)
    return output_path
121
+
122
def cancel_denoise(audio_info):
    """Revert the denoised slot to the raw reference audio (identity pass-through)."""
    return audio_info
124
+
125
+
126
def get_checkpoints_project(project_name=None, is_gradio=True):
    """Get available checkpoint files"""
    base_dir = str(CKPTS_ROOT)

    if project_name is None:
        # No project selected: scan the whole checkpoint tree recursively.
        files_checkpoints = []
        if os.path.isdir(base_dir):
            for ext in ("pt", "safetensors"):
                files_checkpoints.extend(
                    glob(os.path.join(base_dir, f"**/*.{ext}"), recursive=True)
                )
    else:
        # project_name = project_name.replace("_pinyin", "").replace("_char", "")
        if project_name != "F5TTS_v1_Base":
            project_name = "_".join(
                ["F5TTS_v1_Base", "vocos", "custom", project_name.replace("_custom", "")]
            )
        if os.path.isdir(base_dir):
            files_checkpoints = glob(os.path.join(base_dir, project_name, "*.pt"))
            files_checkpoints += glob(os.path.join(base_dir, project_name, "*.safetensors"))
        else:
            files_checkpoints = []

    print("files_checkpoints:", project_name, files_checkpoints)

    # Separate pretrained / regular / last checkpoints.
    pretrained_checkpoints = [
        p for p in files_checkpoints if "pretrained_" in os.path.basename(p)
    ]
    regular_checkpoints = [
        p
        for p in files_checkpoints
        if "pretrained_" not in os.path.basename(p)
        and "model_last.pt" not in os.path.basename(p)
    ]
    last_checkpoint = [
        p for p in files_checkpoints if "model_last.pt" in os.path.basename(p)
    ]

    # Sort regular checkpoints by their embedded step number when possible.
    def _step_number(path):
        return int(os.path.basename(path).split("_")[1].split(".")[0])

    try:
        regular_checkpoints = sorted(regular_checkpoints, key=_step_number)
    except (IndexError, ValueError):
        regular_checkpoints = sorted(regular_checkpoints)

    # Combine in order: pretrained, regular, last; pick the last as default.
    files_checkpoints = pretrained_checkpoints + regular_checkpoints + last_checkpoint
    select_checkpoint = files_checkpoints[-1] if files_checkpoints else None

    if is_gradio:
        return gr.update(choices=files_checkpoints, value=select_checkpoint)

    return files_checkpoints, select_checkpoint
172
+
173
+
174
def get_available_projects():
    """Get available project names from data directory"""
    data_path = str(DATA_ROOT)

    project_list = []
    if os.path.isdir(data_path):
        # Skip evaluation/test folders.
        project_list = [name for name in os.listdir(data_path) if "test" not in name]

    # Fallback to a sensible default if no projects are found
    if not project_list:
        project_list = ["multilingual_acc_grl_custom"]

    return project_list
190
+
191
+
192
def infer(
    project, file_checkpoint, exp_name, ref_text, ref_audio, denoise_audio, gen_text, nfe_step, use_ema, separate_langs, frontend, speed, cfg_strength, use_acc_grl, ref_ratio, no_ref_audio, sway_sampling_coef, use_prosody_encoder, seed
):
    """Synthesize speech with F5TTS, lazily (re)loading the model on change.

    Caches the loaded model in module globals and reloads only when the
    checkpoint, device, or EMA setting differs from the previous call.

    Returns:
        Tuple of (generated wav path or None, status/device message, seed used).
    """
    global last_checkpoint, last_device, tts_api, last_ema

    if not os.path.isfile(file_checkpoint):
        return None, "Checkpoint not found!", ""

    # Prefer the denoised reference audio when the user produced one.
    if denoise_audio:
        ref_audio = denoise_audio

    device_test = device  # Use the global device

    # Reload the model whenever any load-relevant setting changed.
    if last_checkpoint != file_checkpoint or last_device != device_test or last_ema != use_ema or tts_api is None:
        if last_checkpoint != file_checkpoint:
            last_checkpoint = file_checkpoint

        if last_device != device_test:
            last_device = device_test

        if last_ema != use_ema:
            last_ema = use_ema

        # Try to find vocab file
        vocab_file = None
        possible_vocab_paths = [
            str(DATA_ROOT / project / "vocab.txt"),
            # legacy fallbacks for older layouts
            f"./data/{project}/vocab.txt",
            f"../../data/{project}/vocab.txt",
            "./data/Emilia_ZH_EN_pinyin/vocab.txt",
            "../../data/Emilia_ZH_EN_pinyin/vocab.txt",
        ]

        for path in possible_vocab_paths:
            if os.path.isfile(path):
                vocab_file = path
                break

        if vocab_file is None:
            return None, "Vocab file not found!", ""

        try:
            tts_api = F5TTS(
                model=exp_name,
                ckpt_file=file_checkpoint,
                vocab_file=vocab_file,
                device=device_test,
                use_ema=use_ema,
                frontend=frontend,
                use_prosody_encoder=use_prosody_encoder,
                prosody_cfg_path=str(CKPTS_ROOT / "prosody_encoder" / "pretssel_cfg.json"),
                prosody_ckpt_path=str(CKPTS_ROOT / "prosody_encoder" / "prosody_encoder_UnitY2.pt"),
            )
        except Exception as e:
            traceback.print_exc()
            return None, f"Error loading model: {str(e)}", ""

    print("Model loaded >>", device_test, file_checkpoint, use_ema)

    if seed == -1:  # -1 used for random
        seed = None

    try:
        # delete=False so the file survives past the context for Gradio to serve.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
            tts_api.infer(
                ref_file=ref_audio,
                ref_text=ref_text.strip(),
                gen_text=gen_text.strip(),
                nfe_step=nfe_step,
                separate_langs=separate_langs,
                speed=speed,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                use_acc_grl=use_acc_grl,
                ref_ratio=ref_ratio,
                no_ref_audio=no_ref_audio,
                use_prosody_encoder=use_prosody_encoder,
                file_wave=f.name,
                seed=seed,
            )
        return f.name, f"Device: {tts_api.device}", str(tts_api.seed)
    except Exception as e:
        traceback.print_exc()
        return None, f"Inference error: {str(e)}", ""
277
+
278
+
279
def get_gpu_stats():
    """Get GPU statistics as a human-readable multi-line string.

    Guards ``torch.xpu`` / ``torch.backends.mps`` attribute access so the
    function does not raise AttributeError on torch builds that lack those
    backends (requirements pin torch==2.3.1).
    """
    gpu_stats = ""

    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        for i in range(gpu_count):
            gpu_name = torch.cuda.get_device_name(i)
            gpu_properties = torch.cuda.get_device_properties(i)
            total_memory = gpu_properties.total_memory / (1024**3)  # in GB
            allocated_memory = torch.cuda.memory_allocated(i) / (1024**2)  # in MB
            reserved_memory = torch.cuda.memory_reserved(i) / (1024**2)  # in MB

            gpu_stats += (
                f"GPU {i} Name: {gpu_name}\n"
                f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
                f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
                f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
            )
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        gpu_count = torch.xpu.device_count()
        for i in range(gpu_count):
            gpu_name = torch.xpu.get_device_name(i)
            gpu_properties = torch.xpu.get_device_properties(i)
            total_memory = gpu_properties.total_memory / (1024**3)  # in GB
            allocated_memory = torch.xpu.memory_allocated(i) / (1024**2)  # in MB
            reserved_memory = torch.xpu.memory_reserved(i) / (1024**2)  # in MB

            gpu_stats += (
                f"GPU {i} Name: {gpu_name}\n"
                f"Total GPU memory (GPU {i}): {total_memory:.2f} GB\n"
                f"Allocated GPU memory (GPU {i}): {allocated_memory:.2f} MB\n"
                f"Reserved GPU memory (GPU {i}): {reserved_memory:.2f} MB\n\n"
            )
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        gpu_count = 1
        gpu_stats += "MPS GPU\n"
        total_memory = psutil.virtual_memory().total / (
            1024**3
        )  # Total system memory (MPS doesn't have its own memory)
        allocated_memory = 0
        reserved_memory = 0

        gpu_stats += (
            f"Total system memory: {total_memory:.2f} GB\n"
            f"Allocated GPU memory (MPS): {allocated_memory:.2f} MB\n"
            f"Reserved GPU memory (MPS): {reserved_memory:.2f} MB\n"
        )

    else:
        gpu_stats = "No GPU available"

    return gpu_stats
332
+
333
+
334
def get_cpu_stats():
    """Get CPU statistics"""
    cpu_usage = psutil.cpu_percent(interval=1)
    mem = psutil.virtual_memory()
    memory_used = mem.used / (1024**2)
    memory_total = mem.total / (1024**2)
    memory_percent = mem.percent

    # Include this process's scheduling priority for diagnostics.
    nice_value = psutil.Process(os.getpid()).nice()

    return (
        f"CPU Usage: {cpu_usage:.2f}%\n"
        f"System Memory: {memory_used:.2f} MB used / {memory_total:.2f} MB total ({memory_percent}% used)\n"
        f"Process Priority (Nice value): {nice_value}"
    )
353
+
354
+
355
def get_combined_stats():
    """Get combined system stats"""
    return f"### GPU Stats\n{get_gpu_stats()}\n\n### CPU Stats\n{get_cpu_stats()}"
361
+
362
+
363
# Create Gradio interface
# NOTE(review): the original indentation was lost in transit; the widget
# nesting below is reconstructed from the declaration order — verify layout.
with gr.Blocks(title="LEMAS-TTS Inference") as app:
    gr.Markdown(
        """
        # Zero-Shot TTS

        Set seed to -1 for random generation.
        """
    )
    # Collapsible model/inference configuration panel.
    with gr.Accordion("Model configuration", open=False):
        # Model configuration
        with gr.Row():
            exp_name = gr.Radio(
                label="Model", choices=["F5TTS_v1_Base", "F5TTS_Base", "E2TTS_Base"], value="F5TTS_v1_Base", visible=False
            )
        # Project selection
        available_projects = get_available_projects()

        # Get initial checkpoints
        list_checkpoints, checkpoint_select = get_checkpoints_project(available_projects[0] if available_projects else None, False)

        with gr.Row():
            with gr.Column(scale=1):
                # load_models_btn = gr.Button(value="Load models")
                cm_project = gr.Dropdown(
                    choices=available_projects,
                    value=available_projects[0] if available_projects else None,
                    label="Project",
                    allow_custom_value=True,
                    scale=4
                )

            with gr.Column(scale=5):
                cm_checkpoint = gr.Dropdown(
                    choices=list_checkpoints, value=checkpoint_select, label="Checkpoints", allow_custom_value=True  # scale=4,
                )
                bt_checkpoint_refresh = gr.Button("Refresh", scale=1)

        with gr.Row():
            ch_use_ema = gr.Checkbox(label="Use EMA", value=True, scale=2, info="Turn off at early stage might offer better results")
            frontend = gr.Radio(label="Frontend", choices=["phone", "char", "bpe"], value="phone", scale=3)
            separate_langs = gr.Checkbox(label="Separate Languages", value=True, scale=2, info="separate language tokens")

        # Inference parameters
        with gr.Row():
            nfe_step = gr.Number(label="NFE Step", scale=1, value=64)
            speed = gr.Slider(label="Speed", scale=3, value=1.0, minimum=0.5, maximum=1.5, step=0.1)
            cfg_strength = gr.Slider(label="CFG Strength", scale=2, value=5.0, minimum=0.0, maximum=10.0, step=1)
            sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", scale=2, value=3, minimum=-1, maximum=5, step=0.1)
            ref_ratio = gr.Slider(label="Ref Ratio", scale=2, value=1.0, minimum=0.0, maximum=1.0, step=0.1)
            no_ref_audio = gr.Checkbox(label="No Reference Audio", value=False, scale=1, info="No mel condition")
            use_acc_grl = gr.Checkbox(label="Use accent grl condition", value=False, scale=1, info="Use accent grl condition")
            use_prosody_encoder = gr.Checkbox(label="Use prosody encoder", value=False, scale=1, info="Use prosody encoder")
            seed = gr.Number(label="Random Seed", scale=1, value=5828684826493313192, minimum=-1)

    # Input fields
    ref_text = gr.Textbox(label="Reference Text", placeholder="Enter the text for the reference audio...")
    ref_audio = gr.Audio(label="Reference Audio", type="filepath", interactive=True, show_download_button=True, editable=True)

    # Optional UVR5 denoise step for the reference audio.
    with gr.Row():
        denoise_btn = gr.Button(value="Denoise")
        cancel_btn = gr.Button(value="Cancel Denoise")
    denoise_audio = gr.Audio(label="Denoised Audio", value=None, type="filepath", interactive=True, show_download_button=True, editable=True)

    gen_text = gr.Textbox(label="Text to Generate", placeholder="Enter the text you want to generate...")

    # Inference button and outputs
    with gr.Row():
        txt_info_gpu = gr.Textbox("", label="Device Info")
        seed_info = gr.Textbox(label="Used Random Seed")
        check_button_infer = gr.Button("Generate Audio", variant="primary")

    gen_audio = gr.Audio(label="Generated Audio", type="filepath", interactive=True, show_download_button=True, editable=True)

    # Examples
    examples = gr.Examples(
        examples=[
            [
                "Ich glaub, mein Schwein pfeift.",
                str(DATA_ROOT / "test_examples" / "de.wav"),
                "我觉得我的猪在吹口哨。",
            ],
            [
                "em, #1 I have a list of YouTubers, and I'm gonna be going to their houses and raiding them by.",
                str(DATA_ROOT / "test_examples" / "en.wav"),
                "我有一份 YouTuber 名单,我打算去他们家,对他们进行突袭。",
            ],
            [
                "Te voy a dar un tip #1 que le copia a John Rockefeller, uno de los empresarios más picudos de la historia.",
                str(DATA_ROOT / "test_examples" / "es.wav"),
                "我要给你一个从历史上最精明的商人之一约翰·洛克菲勒那里抄来的秘诀。",
            ],
            [
                "Per l'amor di Dio #1 fai, #2 se pensi di non poterti fermare, fallo #1 e fallo.",
                str(DATA_ROOT / "test_examples" / "it.wav"),
                "看在上帝的份上,去做吧,如果你认为你无法停止,那就去做吧,继续做下去。",
            ],
            [
                "Nova, #1 dia 25 desse mês vai rolar operação the last Frontier.",
                str(DATA_ROOT / "test_examples" / "pt.wav"),
                "新消息,本月二十五日,'最后的边疆行动'将启动。",
            ],
            # ["Good morning! #1 ",
            # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_0.wav",
            # " #1"
            # ],
            # ["Good morning! #1 ",
            # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_1.wav",
            # " #1",
            # ],
            # ["Good morning! #1 ",
            # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_2.wav",
            # " #1",
            # ],
            # ["Oh, and in case I don't see ya, #1",
            # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_3.wav",
            # " #1",
            # ],
            # ["Good afternoon, good evening, and good night. #1",
            # "/mnt/code/lemas/F5-TTS/data/trueman/recognition_d0a02641c090813574a8ec398220339f_4.wav",
            # " #1",
            # ],
        ],
        inputs=[
            ref_text,
            ref_audio,
            gen_text,
        ],
        outputs=[gen_audio, txt_info_gpu, seed_info],
        fn=infer,
        cache_examples=False
    )

    # System Info section at the bottom
    gr.Markdown("---")
    gr.Markdown("## System Information")
    with gr.Accordion("Update System Stats", open=False):
        update_button = gr.Button("Update System Stats", scale=1)
        output_box = gr.Textbox(label="GPU and CPU Information", lines=5, scale=5)

    def update_stats():
        # Small wrapper so the click handler refreshes both GPU and CPU stats.
        return get_combined_stats()


    denoise_btn.click(fn=denoise,
                      inputs=[ref_audio],
                      outputs=[denoise_audio])

    cancel_btn.click(fn=cancel_denoise,
                     inputs=[ref_audio],
                     outputs=[denoise_audio])

    # Event handlers
    check_button_infer.click(
        fn=infer,
        inputs=[
            cm_project,
            cm_checkpoint,
            exp_name,
            ref_text,
            ref_audio,
            denoise_audio,
            gen_text,
            nfe_step,
            ch_use_ema,
            separate_langs,
            frontend,
            speed,
            cfg_strength,
            use_acc_grl,
            ref_ratio,
            no_ref_audio,
            sway_sampling_coef,
            use_prosody_encoder,
            seed,
        ],
        outputs=[gen_audio, txt_info_gpu, seed_info],
    )

    bt_checkpoint_refresh.click(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])
    cm_project.change(fn=get_checkpoints_project, inputs=[cm_project], outputs=[cm_checkpoint])

    # Changing the reference audio invalidates any previous denoised result.
    ref_audio.change(
        fn=lambda x: None,
        inputs=[ref_audio],
        outputs=[denoise_audio]
    )

    update_button.click(fn=update_stats, outputs=output_box)

    # Auto-load system stats on startup
    app.load(fn=update_stats, outputs=output_box)
557
+
558
+
559
@click.command()
@click.option("--port", "-p", default=7860, type=int, help="Port to run the app on")
@click.option("--host", "-H", default="0.0.0.0", help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
    """Launch the Gradio inference server with the given CLI options."""
    print("Starting LEMAS-TTS Inference Interface...")
    print(f"Device: {device}")
    # Queueing enables concurrent requests; allowed_paths lets Gradio serve
    # the example audio files from the data directory.
    app.queue(api_open=api).launch(
        server_name=host,
        server_port=port,
        share=share,
        show_api=api,
        allowed_paths=[str(DATA_ROOT)],
    )
581
+
582
+
583
# Script entry point: launch the Gradio server via the click CLI.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+ faster-whisper==1.1.0
3
+ whisperx==3.1.1
4
+ accelerate>=0.33.0
5
+ aiofiles==23.2.1
6
+ aiohappyeyeballs==2.6.1
7
+ aiohttp==3.13.2
8
+ aiosignal==1.4.0
9
+ annotated-doc==0.0.4
10
+ annotated-types==0.7.0
11
+ antlr4-python3-runtime==4.9.3
12
+ anyio==4.12.0
13
+ attrs==25.4.0
14
+ audioread==3.1.0
15
+ babel==2.17.0
16
+ bitsandbytes>0.37.0; platform_machine != "arm64" and platform_system != "Darwin"
17
+ boto3==1.42.16
18
+ botocore==1.42.16
19
+ brotli==1.2.0
20
+ cached_path
21
+ cachetools==6.2.4
22
+ certifi==2025.11.12
23
+ cffi==2.0.0
24
+ charset-normalizer==3.4.4
25
+ click
26
+ contourpy==1.3.2
27
+ csvw==3.7.0
28
+ cycler==0.12.1
29
+ datasets
30
+ decorator==5.2.1
31
+ dill==0.4.0
32
+ dlinfo==2.0.0
33
+ docopt==0.6.2
34
+ einops==0.8.1
35
+ einx==0.3.0
36
+ ema-pytorch==0.7.3
37
+ encodec==0.1.1
38
+ espeakng==1.0.2
39
+ espeakng-loader==0.2.4
40
+ espeak_phonemizer==1.3.1
41
+ fastapi==0.127.0
42
+ ffmpy==1.0.0
43
+ filelock==3.20.1
44
+ fonttools==4.61.1
45
+ frozendict==2.4.7
46
+ frozenlist==1.8.0
47
+ fsspec==2025.10.0
48
+ gitdb==4.0.12
49
+ GitPython==3.1.45
50
+ google-api-core==2.28.1
51
+ google-auth==2.45.0
52
+ google-cloud-core==2.5.0
53
+ google-cloud-storage==3.7.0
54
+ google-crc32c==1.8.0
55
+ google-resumable-media==2.8.0
56
+ googleapis-common-protos==1.72.0
57
+ gradio==5.38.0
58
+ gradio-client==1.11.0
59
+ groovy==0.1.2
60
+ h11==0.16.0
61
+ hf-xet==1.2.0
62
+ httpcore==1.0.9
63
+ httpx==0.28.1
64
+ huggingface-hub==0.36.0
65
+ hydra-core>=1.3.0
66
+ idna==3.11
67
+ isodate==0.7.2
68
+ jieba
69
+ Jinja2==3.1.6
70
+ jmespath==1.0.1
71
+ joblib==1.5.3
72
+ jsonschema==4.25.1
73
+ jsonschema-specifications==2025.9.1
74
+ kiwisolver==1.4.9
75
+ langid==1.1.6
76
+ language-tags==1.2.0
77
+ lazy_loader==0.4
78
+ librosa
79
+ llvmlite==0.42.0
80
+ loguru==0.7.3
81
+ markdown-it-py==4.0.0
82
+ MarkupSafe
83
+ matplotlib
84
+ mdurl==0.1.2
85
+ mpmath==1.3.0
86
+ msgpack==1.1.2
87
+ multidict==6.7.0
88
+ multiprocess==0.70.18
89
+ networkx==3.1
90
+ num2words==0.5.13
91
+ numba==0.59.0
92
+ numpy==1.26.0
93
+ nvidia-cublas-cu12==12.1.3.1
94
+ nvidia-cuda-cupti-cu12==12.1.105
95
+ nvidia-cuda-nvrtc-cu12==12.1.105
96
+ nvidia-cuda-runtime-cu12==12.1.105
97
+ nvidia-cudnn-cu12==8.9.2.26
98
+ nvidia-cufft-cu12==11.0.2.54
99
+ nvidia-cufile-cu12==1.11.1.6
100
+ nvidia-curand-cu12==10.3.2.106
101
+ nvidia-cusolver-cu12==11.4.5.107
102
+ nvidia-cusparse-cu12==12.1.0.106
103
+ nvidia-cusparselt-cu12==0.6.3
104
+ nvidia-nccl-cu12==2.20.5
105
+ nvidia-nvjitlink-cu12==12.6.85
106
+ nvidia-nvtx-cu12==12.1.105
107
+ omegaconf==2.3.0
108
+ onnx==1.16.0
109
+ onnxruntime
110
+ onnxruntime-gpu
111
+ orjson==3.11.5
112
+ packaging==25.0
113
+ pandas==2.3.3
114
+ phonemizer==3.3.0
115
+ pillow==11.3.0
116
+ platformdirs==4.5.1
117
+ pooch==1.8.2
118
+ propcache==0.4.1
119
+ proto-plus==1.27.0
120
+ protobuf==6.33.2
121
+ psutil==7.2.0
122
+ pyarrow==22.0.0
123
+ pyasn1==0.6.1
124
+ pyasn1_modules==0.4.2
125
+ pycparser==2.23
126
+ pydantic<=2.10.6
127
+ pydantic_core==2.27.2
128
+ pydub
129
+ py-espeak-ng==0.1.8
130
+ Pygments==2.19.2
131
+ pyparsing==3.3.1
132
+ pypinyin
133
+ pypinyin-dict
134
+ python-dateutil==2.9.0.post0
135
+ python-multipart==0.0.21
136
+ pytz==2025.2
137
+ PyYAML==6.0.3
138
+ rdflib==7.5.0
139
+ referencing==0.37.0
140
+ regex
141
+ requests==2.32.5
142
+ rfc3986==1.5.0
143
+ rich==13.9.4
144
+ rpds-py==0.30.0
145
+ rsa==4.9.1
146
+ s3transfer==0.16.0
147
+ safehttpx==0.1.7
148
+ safetensors
149
+ scikit-learn==1.7.1
150
+ scipy==1.15.3
151
+ segments==2.3.0
152
+ semantic-version==2.10.0
153
+ sentry-sdk==2.48.0
154
+ setuptools==80.9.0
155
+ shellingham==1.5.4
156
+ six==1.17.0
157
+ smmap==5.0.2
158
+ soundfile
159
+ soxr==1.0.0
160
+ starlette==0.50.0
161
+ sympy==1.14.0
162
+ termcolor==3.2.0
163
+ threadpoolctl==3.6.0
164
+ tokenizers==0.22.1
165
+ tomli
166
+ tomlkit==0.13.3
167
+ torch==2.3.1
168
+ torchaudio==2.3.1
169
+ torchdiffeq==0.2.4
170
+ tqdm>=4.65.0
171
+ transformers
172
+ transformers-stream-generator
173
+ triton==2.3.1
174
+ typer==0.16.0
175
+ typing_extensions==4.12.2
176
+ tzdata==2025.3
177
+ uritemplate==4.2.0
178
+ urllib3==2.6.2
179
+ uroman
180
+ uvicorn==0.40.0
181
+ vocos
182
+ x-transformers>=1.31.14
183
+ xxhash==3.6.0
184
+ yarl==1.22.0
185
+ zhconv
uvr5/gui_data/constants.py ADDED
@@ -0,0 +1,1147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import platform
2
+
3
+ #Platform Details
4
+ OPERATING_SYSTEM = platform.system()
5
+ SYSTEM_ARCH = platform.platform()
6
+ SYSTEM_PROC = platform.processor()
7
+ ARM = 'arm'
8
+
9
+ #Main Font
10
+ MAIN_FONT_NAME = "Century Gothic"
11
+
12
+ #Model Types
13
+ VR_ARCH_TYPE = 'VR Arc'
14
+ MDX_ARCH_TYPE = 'MDX-Net'
15
+ DEMUCS_ARCH_TYPE = 'Demucs'
16
+ VR_ARCH_PM = 'VR Architecture'
17
+ ENSEMBLE_MODE = 'Ensemble Mode'
18
+ ENSEMBLE_STEM_CHECK = 'Ensemble Stem'
19
+ SECONDARY_MODEL = 'Secondary Model'
20
+ DEMUCS_6_STEM_MODEL = 'htdemucs_6s'
21
+
22
+ DEMUCS_V3_ARCH_TYPE = 'Demucs v3'
23
+ DEMUCS_V4_ARCH_TYPE = 'Demucs v4'
24
+ DEMUCS_NEWER_ARCH_TYPES = [DEMUCS_V3_ARCH_TYPE, DEMUCS_V4_ARCH_TYPE]
25
+
26
+ DEMUCS_V1 = 'v1'
27
+ DEMUCS_V2 = 'v2'
28
+ DEMUCS_V3 = 'v3'
29
+ DEMUCS_V4 = 'v4'
30
+
31
+ DEMUCS_V1_TAG = 'v1 | '
32
+ DEMUCS_V2_TAG = 'v2 | '
33
+ DEMUCS_V3_TAG = 'v3 | '
34
+ DEMUCS_V4_TAG = 'v4 | '
35
+ DEMUCS_NEWER_TAGS = [DEMUCS_V3_TAG, DEMUCS_V4_TAG]
36
+
37
+ DEMUCS_VERSION_MAPPER = {
38
+ DEMUCS_V1:DEMUCS_V1_TAG,
39
+ DEMUCS_V2:DEMUCS_V2_TAG,
40
+ DEMUCS_V3:DEMUCS_V3_TAG,
41
+ DEMUCS_V4:DEMUCS_V4_TAG}
42
+
43
+ #Download Center
44
+ DOWNLOAD_FAILED = 'Download Failed'
45
+ DOWNLOAD_STOPPED = 'Download Stopped'
46
+ DOWNLOAD_COMPLETE = 'Download Complete'
47
+ DOWNLOAD_UPDATE_COMPLETE = 'Update Download Complete'
48
+ SETTINGS_MENU_EXIT = 'exit'
49
+ NO_CONNECTION = 'No Internet Connection'
50
+ VIP_SELECTION = 'VIP:'
51
+ DEVELOPER_SELECTION = 'VIP:'
52
+ NO_NEW_MODELS = 'All Available Models Downloaded'
53
+ ENSEMBLE_PARTITION = ': '
54
+ NO_MODEL = 'No Model Selected'
55
+ CHOOSE_MODEL = 'Choose Model'
56
+ SINGLE_DOWNLOAD = 'Downloading Item 1/1...'
57
+ DOWNLOADING_ITEM = 'Downloading Item'
58
+ FILE_EXISTS = 'File already exists!'
59
+ DOWNLOADING_UPDATE = 'Downloading Update...'
60
+ DOWNLOAD_MORE = 'Download More Models'
61
+
62
+ #Menu Options
63
+
64
+ AUTO_SELECT = 'Auto'
65
+
66
+ #LINKS
67
+ DOWNLOAD_CHECKS = "https://raw.githubusercontent.com/TRvlvr/application_data/main/filelists/download_checks.json"
68
+ MDX_MODEL_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_data.json"
69
+ VR_MODEL_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/vr_model_data/model_data.json"
70
+
71
+ DEMUCS_MODEL_NAME_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/demucs_model_data/model_name_mapper.json"
72
+ MDX_MODEL_NAME_DATA_LINK = "https://raw.githubusercontent.com/TRvlvr/application_data/main/mdx_model_data/model_name_mapper.json"
73
+
74
+ DONATE_LINK_BMAC = "https://www.buymeacoffee.com/uvr5"
75
+ DONATE_LINK_PATREON = "https://www.patreon.com/uvr"
76
+
77
+ #DOWNLOAD REPOS
78
+ NORMAL_REPO = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/"
79
+ UPDATE_REPO = "https://github.com/TRvlvr/model_repo/releases/download/uvr_update_patches/"
80
+
81
+ UPDATE_MAC_ARM_REPO = "https://github.com/Anjok07/ultimatevocalremovergui/releases/download/v5.5.0/Ultimate_Vocal_Remover_v5_5_MacOS_arm64.dmg"
82
+ UPDATE_MAC_X86_64_REPO = "https://github.com/Anjok07/ultimatevocalremovergui/releases/download/v5.5.0/Ultimate_Vocal_Remover_v5_5_MacOS_x86_64.dmg"
83
+ UPDATE_LINUX_REPO = "https://github.com/Anjok07/ultimatevocalremovergui#linux-installation"
84
+ UPDATE_REPO = "https://github.com/TRvlvr/model_repo/releases/download/uvr_update_patches/"
85
+
86
+ ISSUE_LINK = 'https://github.com/Anjok07/ultimatevocalremovergui/issues/new'
87
+ VIP_REPO = b'\xf3\xc2W\x19\x1foI)\xc2\xa9\xcc\xb67(Z\xf5',\
88
+ b'gAAAAABjQAIQ-NpNMMxMedpKHHb7ze_nqB05hw0YhbOy3pFzuzDrfqumn8_qvraxEoUpZC5ZXC0gGvfDxFMqyq9VWbYKlA67SUFI_wZB6QoVyGI581vs7kaGfUqlXHIdDS6tQ_U-BfjbEAK9EU_74-R2zXjz8Xzekw=='
89
+ NO_CODE = 'incorrect_code'
90
+
91
+ #Extensions
92
+
93
+ ONNX = '.onnx'
94
+ CKPT = '.ckpt'
95
+ YAML = '.yaml'
96
+ PTH = '.pth'
97
+ TH_EXT = '.th'
98
+ JSON = '.json'
99
+
100
+ #GUI Buttons
101
+
102
+ START_PROCESSING = 'Start Processing'
103
+ WAIT_PROCESSING = 'Please wait...'
104
+ STOP_PROCESSING = 'Halting process, please wait...'
105
+ LOADING_MODELS = 'Loading models...'
106
+
107
+ #---Messages and Logs----
108
+
109
+ MISSING_MODEL = 'missing'
110
+ MODEL_PRESENT = 'present'
111
+
112
+ UNRECOGNIZED_MODEL = 'Unrecognized Model Detected', ' is an unrecognized model.\n\n' + \
113
+ 'Would you like to select the correct parameters before continuing?'
114
+
115
+ STOP_PROCESS_CONFIRM = 'Confirmation', 'You are about to stop all active processes.\n\nAre you sure you wish to continue?'
116
+ NO_ENSEMBLE_SELECTED = 'No Models Selected', 'Please select ensemble and try again.'
117
+ PICKLE_CORRU = 'File Corrupted', 'Unable to load this ensemble.\n\n' + \
118
+ 'Would you like to remove this ensemble from your list?'
119
+ DELETE_ENS_ENTRY = 'Confirm Removal', 'Are you sure you want to remove this entry?'
120
+
121
+ ALL_STEMS = 'All Stems'
122
+ VOCAL_STEM = 'Vocals'
123
+ INST_STEM = 'Instrumental'
124
+ OTHER_STEM = 'Other'
125
+ BASS_STEM = 'Bass'
126
+ DRUM_STEM = 'Drums'
127
+ GUITAR_STEM = 'Guitar'
128
+ PIANO_STEM = 'Piano'
129
+ SYNTH_STEM = 'Synthesizer'
130
+ STRINGS_STEM = 'Strings'
131
+ WOODWINDS_STEM = 'Woodwinds'
132
+ BRASS_STEM = 'Brass'
133
+ WIND_INST_STEM = 'Wind Inst'
134
+ NO_OTHER_STEM = 'No Other'
135
+ NO_BASS_STEM = 'No Bass'
136
+ NO_DRUM_STEM = 'No Drums'
137
+ NO_GUITAR_STEM = 'No Guitar'
138
+ NO_PIANO_STEM = 'No Piano'
139
+ NO_SYNTH_STEM = 'No Synthesizer'
140
+ NO_STRINGS_STEM = 'No Strings'
141
+ NO_WOODWINDS_STEM = 'No Woodwinds'
142
+ NO_WIND_INST_STEM = 'No Wind Inst'
143
+ NO_BRASS_STEM = 'No Brass'
144
+ PRIMARY_STEM = 'Primary Stem'
145
+ SECONDARY_STEM = 'Secondary Stem'
146
+
147
+ #Other Constants
148
+ DEMUCS_2_SOURCE = ["instrumental", "vocals"]
149
+ DEMUCS_4_SOURCE = ["drums", "bass", "other", "vocals"]
150
+
151
+ DEMUCS_2_SOURCE_MAPPER = {
152
+ INST_STEM: 0,
153
+ VOCAL_STEM: 1}
154
+
155
+ DEMUCS_4_SOURCE_MAPPER = {
156
+ BASS_STEM: 0,
157
+ DRUM_STEM: 1,
158
+ OTHER_STEM: 2,
159
+ VOCAL_STEM: 3}
160
+
161
+ DEMUCS_6_SOURCE_MAPPER = {
162
+ BASS_STEM: 0,
163
+ DRUM_STEM: 1,
164
+ OTHER_STEM: 2,
165
+ VOCAL_STEM: 3,
166
+ GUITAR_STEM:4,
167
+ PIANO_STEM:5}
168
+
169
+ DEMUCS_4_SOURCE_LIST = [BASS_STEM, DRUM_STEM, OTHER_STEM, VOCAL_STEM]
170
+ DEMUCS_6_SOURCE_LIST = [BASS_STEM, DRUM_STEM, OTHER_STEM, VOCAL_STEM, GUITAR_STEM, PIANO_STEM]
171
+
172
+ DEMUCS_UVR_MODEL = 'UVR_Model'
173
+
174
+ CHOOSE_STEM_PAIR = 'Choose Stem Pair'
175
+
176
+ STEM_SET_MENU = (VOCAL_STEM,
177
+ INST_STEM,
178
+ OTHER_STEM,
179
+ BASS_STEM,
180
+ DRUM_STEM,
181
+ GUITAR_STEM,
182
+ PIANO_STEM,
183
+ SYNTH_STEM,
184
+ STRINGS_STEM,
185
+ WOODWINDS_STEM,
186
+ BRASS_STEM,
187
+ WIND_INST_STEM,
188
+ NO_OTHER_STEM,
189
+ NO_BASS_STEM,
190
+ NO_DRUM_STEM,
191
+ NO_GUITAR_STEM,
192
+ NO_PIANO_STEM,
193
+ NO_SYNTH_STEM,
194
+ NO_STRINGS_STEM,
195
+ NO_WOODWINDS_STEM,
196
+ NO_BRASS_STEM,
197
+ NO_WIND_INST_STEM)
198
+
199
+ STEM_PAIR_MAPPER = {
200
+ VOCAL_STEM: INST_STEM,
201
+ INST_STEM: VOCAL_STEM,
202
+ OTHER_STEM: NO_OTHER_STEM,
203
+ BASS_STEM: NO_BASS_STEM,
204
+ DRUM_STEM: NO_DRUM_STEM,
205
+ GUITAR_STEM: NO_GUITAR_STEM,
206
+ PIANO_STEM: NO_PIANO_STEM,
207
+ SYNTH_STEM: NO_SYNTH_STEM,
208
+ STRINGS_STEM: NO_STRINGS_STEM,
209
+ WOODWINDS_STEM: NO_WOODWINDS_STEM,
210
+ BRASS_STEM: NO_BRASS_STEM,
211
+ WIND_INST_STEM: NO_WIND_INST_STEM,
212
+ NO_OTHER_STEM: OTHER_STEM,
213
+ NO_BASS_STEM: BASS_STEM,
214
+ NO_DRUM_STEM: DRUM_STEM,
215
+ NO_GUITAR_STEM: GUITAR_STEM,
216
+ NO_PIANO_STEM: PIANO_STEM,
217
+ NO_SYNTH_STEM: SYNTH_STEM,
218
+ NO_STRINGS_STEM: STRINGS_STEM,
219
+ NO_WOODWINDS_STEM: WOODWINDS_STEM,
220
+ NO_BRASS_STEM: BRASS_STEM,
221
+ NO_WIND_INST_STEM: WIND_INST_STEM,
222
+ PRIMARY_STEM: SECONDARY_STEM}
223
+
224
+ NON_ACCOM_STEMS = (
225
+ VOCAL_STEM,
226
+ OTHER_STEM,
227
+ BASS_STEM,
228
+ DRUM_STEM,
229
+ GUITAR_STEM,
230
+ PIANO_STEM,
231
+ SYNTH_STEM,
232
+ STRINGS_STEM,
233
+ WOODWINDS_STEM,
234
+ BRASS_STEM,
235
+ WIND_INST_STEM)
236
+
237
+ MDX_NET_FREQ_CUT = [VOCAL_STEM, INST_STEM]
238
+
239
+ DEMUCS_4_STEM_OPTIONS = (ALL_STEMS, VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM)
240
+ DEMUCS_6_STEM_OPTIONS = (ALL_STEMS, VOCAL_STEM, OTHER_STEM, BASS_STEM, DRUM_STEM, GUITAR_STEM, PIANO_STEM)
241
+ DEMUCS_2_STEM_OPTIONS = (VOCAL_STEM, INST_STEM)
242
+ DEMUCS_4_STEM_CHECK = (OTHER_STEM, BASS_STEM, DRUM_STEM)
243
+
244
+ #Menu Dropdowns
245
+
246
+ VOCAL_PAIR = f'{VOCAL_STEM}/{INST_STEM}'
247
+ INST_PAIR = f'{INST_STEM}/{VOCAL_STEM}'
248
+ OTHER_PAIR = f'{OTHER_STEM}/{NO_OTHER_STEM}'
249
+ DRUM_PAIR = f'{DRUM_STEM}/{NO_DRUM_STEM}'
250
+ BASS_PAIR = f'{BASS_STEM}/{NO_BASS_STEM}'
251
+ FOUR_STEM_ENSEMBLE = '4 Stem Ensemble'
252
+
253
+ ENSEMBLE_MAIN_STEM = (CHOOSE_STEM_PAIR, VOCAL_PAIR, OTHER_PAIR, DRUM_PAIR, BASS_PAIR, FOUR_STEM_ENSEMBLE)
254
+
255
+ MIN_SPEC = 'Min Spec'
256
+ MAX_SPEC = 'Max Spec'
257
+ AUDIO_AVERAGE = 'Average'
258
+
259
+ MAX_MIN = f'{MAX_SPEC}/{MIN_SPEC}'
260
+ MAX_MAX = f'{MAX_SPEC}/{MAX_SPEC}'
261
+ MAX_AVE = f'{MAX_SPEC}/{AUDIO_AVERAGE}'
262
+ MIN_MAX = f'{MIN_SPEC}/{MAX_SPEC}'
263
+ MIN_MIX = f'{MIN_SPEC}/{MIN_SPEC}'
264
+ MIN_AVE = f'{MIN_SPEC}/{AUDIO_AVERAGE}'
265
+ AVE_MAX = f'{AUDIO_AVERAGE}/{MAX_SPEC}'
266
+ AVE_MIN = f'{AUDIO_AVERAGE}/{MIN_SPEC}'
267
+ AVE_AVE = f'{AUDIO_AVERAGE}/{AUDIO_AVERAGE}'
268
+
269
+ ENSEMBLE_TYPE = (MAX_MIN, MAX_MAX, MAX_AVE, MIN_MAX, MIN_MIX, MIN_AVE, AVE_MAX, AVE_MIN, AVE_AVE)
270
+ ENSEMBLE_TYPE_4_STEM = (MAX_SPEC, MIN_SPEC, AUDIO_AVERAGE)
271
+
272
+ BATCH_MODE = 'Batch Mode'
273
+ BETA_VERSION = 'BETA'
274
+ DEF_OPT = 'Default'
275
+
276
+ CHUNKS = (AUTO_SELECT, '1', '5', '10', '15', '20',
277
+ '25', '30', '35', '40', '45', '50',
278
+ '55', '60', '65', '70', '75', '80',
279
+ '85', '90', '95', 'Full')
280
+
281
+ BATCH_SIZE = (DEF_OPT, '2', '3', '4', '5',
282
+ '6', '7', '8', '9', '10')
283
+
284
+ VOL_COMPENSATION = (AUTO_SELECT, '1.035', '1.08')
285
+
286
+ MARGIN_SIZE = ('44100', '22050', '11025')
287
+
288
+ AUDIO_TOOLS = 'Audio Tools'
289
+
290
+ MANUAL_ENSEMBLE = 'Manual Ensemble'
291
+ TIME_STRETCH = 'Time Stretch'
292
+ CHANGE_PITCH = 'Change Pitch'
293
+ ALIGN_INPUTS = 'Align Inputs'
294
+
295
+ if OPERATING_SYSTEM == 'Windows' or OPERATING_SYSTEM == 'Darwin':
296
+ AUDIO_TOOL_OPTIONS = (MANUAL_ENSEMBLE, TIME_STRETCH, CHANGE_PITCH, ALIGN_INPUTS)
297
+ else:
298
+ AUDIO_TOOL_OPTIONS = (MANUAL_ENSEMBLE, ALIGN_INPUTS)
299
+
300
+ MANUAL_ENSEMBLE_OPTIONS = (MIN_SPEC, MAX_SPEC, AUDIO_AVERAGE)
301
+
302
+ PROCESS_METHODS = (VR_ARCH_PM, MDX_ARCH_TYPE, DEMUCS_ARCH_TYPE, ENSEMBLE_MODE, AUDIO_TOOLS)
303
+
304
+ DEMUCS_SEGMENTS = ('Default', '1', '5', '10', '15', '20',
305
+ '25', '30', '35', '40', '45', '50',
306
+ '55', '60', '65', '70', '75', '80',
307
+ '85', '90', '95', '100')
308
+
309
+ DEMUCS_SHIFTS = (0, 1, 2, 3, 4, 5,
310
+ 6, 7, 8, 9, 10, 11,
311
+ 12, 13, 14, 15, 16, 17,
312
+ 18, 19, 20)
313
+
314
+ DEMUCS_OVERLAP = (0.25, 0.50, 0.75, 0.99)
315
+
316
+ VR_AGGRESSION = (1, 2, 3, 4, 5,
317
+ 6, 7, 8, 9, 10, 11,
318
+ 12, 13, 14, 15, 16, 17,
319
+ 18, 19, 20)
320
+
321
+ VR_WINDOW = ('320', '512','1024')
322
+ VR_CROP = ('256', '512', '1024')
323
+ POST_PROCESSES_THREASHOLD_VALUES = ('0.1', '0.2', '0.3')
324
+
325
+ MDX_POP_PRO = ('MDX-NET_Noise_Profile_14_kHz', 'MDX-NET_Noise_Profile_17_kHz', 'MDX-NET_Noise_Profile_Full_Band')
326
+ MDX_POP_STEMS = ('Vocals', 'Instrumental', 'Other', 'Drums', 'Bass')
327
+ MDX_POP_NFFT = ('4096', '5120', '6144', '7680', '8192', '16384')
328
+ MDX_POP_DIMF = ('2048', '3072', '4096')
329
+
330
+ SAVE_ENSEMBLE = 'Save Ensemble'
331
+ CLEAR_ENSEMBLE = 'Clear Selection(s)'
332
+ MENU_SEPARATOR = 35*'•'
333
+ CHOOSE_ENSEMBLE_OPTION = 'Choose Option'
334
+
335
+ INVALID_ENTRY = 'Invalid Input, Please Try Again'
336
+ ENSEMBLE_INPUT_RULE = '1. Only letters, numbers, spaces, and dashes allowed.\n2. No dashes or spaces at the start or end of input.'
337
+
338
+ ENSEMBLE_OPTIONS = (SAVE_ENSEMBLE, CLEAR_ENSEMBLE)
339
+ ENSEMBLE_CHECK = 'ensemble check'
340
+
341
+ SELECT_SAVED_ENSEMBLE = 'Select Saved Ensemble'
342
+ SELECT_SAVED_SETTING = 'Select Saved Setting'
343
+ ENSEMBLE_OPTION = "Ensemble Customization Options"
344
+ MDX_OPTION = "Advanced MDX-Net Options"
345
+ DEMUCS_OPTION = "Advanced Demucs Options"
346
+ VR_OPTION = "Advanced VR Options"
347
+ HELP_OPTION = "Open Information Guide"
348
+ ERROR_OPTION = "Open Error Log"
349
+ VERIFY_BEGIN = 'Verifying file '
350
+ SAMPLE_BEGIN = 'Creating Sample '
351
+ MODEL_MISSING_CHECK = 'Model Missing:'
352
+
353
+ # Audio Player
354
+
355
+ PLAYING_SONG = ": Playing"
356
+ PAUSE_SONG = ": Paused"
357
+ STOP_SONG = ": Stopped"
358
+
359
+ SELECTED_VER = 'Selected'
360
+ DETECTED_VER = 'Detected'
361
+
362
+ SAMPLE_MODE_CHECKBOX = lambda v:f'Sample Mode ({v}s)'
363
+ REMOVED_FILES = lambda r, e:f'Audio Input Verification Report:\n\nRemoved Files:\n\n{r}\n\nError Details:\n\n{e}'
364
+ ADVANCED_SETTINGS = (ENSEMBLE_OPTION, MDX_OPTION, DEMUCS_OPTION, VR_OPTION, HELP_OPTION, ERROR_OPTION)
365
+
366
+ WAV = 'WAV'
367
+ FLAC = 'FLAC'
368
+ MP3 = 'MP3'
369
+
370
+ MP3_BIT_RATES = ('96k', '128k', '160k', '224k', '256k', '320k')
371
+ WAV_TYPE = ('PCM_U8', 'PCM_16', 'PCM_24', 'PCM_32', '32-bit Float', '64-bit Float')
372
+
373
+ SELECT_SAVED_SET = 'Choose Option'
374
+ SAVE_SETTINGS = 'Save Current Settings'
375
+ RESET_TO_DEFAULT = 'Reset to Default'
376
+ RESET_FULL_TO_DEFAULT = 'Reset to Default'
377
+ RESET_PM_TO_DEFAULT = 'Reset All Application Settings to Default'
378
+
379
+ SAVE_SET_OPTIONS = (SAVE_SETTINGS, RESET_TO_DEFAULT)
380
+
381
+ TIME_PITCH = ('1.0', '2.0', '3.0', '4.0')
382
+ TIME_TEXT = '_time_stretched'
383
+ PITCH_TEXT = '_pitch_shifted'
384
+
385
+ #RegEx Input Validation
386
+
387
+ REG_PITCH = r'^[-+]?(1[0]|[0-9]([.][0-9]*)?)$'
388
+ REG_TIME = r'^[+]?(1[0]|[0-9]([.][0-9]*)?)$'
389
+ REG_COMPENSATION = r'\b^(1[0]|[0-9]([.][0-9]*)?|Auto|None)$\b'
390
+ REG_THES_POSTPORCESS = r'\b^([0]([.][0-9]{0,6})?)$\b'
391
+ REG_CHUNKS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Auto|Full)$\b'
392
+ REG_CHUNKS_DEMUCS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Auto|Full)$\b'
393
+ REG_MARGIN = r'\b^[0-9]*$\b'
394
+ REG_SEGMENTS = r'\b^(200|1[0-9][0-9]|[1-9][0-9]?|Default)$\b'
395
+ REG_SAVE_INPUT = r'\b^([a-zA-Z0-9 -]{0,25})$\b'
396
+ REG_AGGRESSION = r'^[-+]?[0-9]\d*?$'
397
+ REG_WINDOW = r'\b^[0-9]{0,4}$\b'
398
+ REG_SHIFTS = r'\b^[0-9]*$\b'
399
+ REG_BATCHES = r'\b^([0-9]*?|Default)$\b'
400
+ REG_OVERLAP = r'\b^([0]([.][0-9]{0,6})?|None)$\b'
401
+
402
+ # Sub Menu
403
+
404
+ VR_ARCH_SETTING_LOAD = 'Load for VR Arch'
405
+ MDX_SETTING_LOAD = 'Load for MDX-Net'
406
+ DEMUCS_SETTING_LOAD = 'Load for Demucs'
407
+ ALL_ARCH_SETTING_LOAD = 'Load for Full Application'
408
+
409
+ # Mappers
410
+
411
+ DEFAULT_DATA = {
412
+
413
+ 'chosen_process_method': MDX_ARCH_TYPE,
414
+ 'vr_model': CHOOSE_MODEL,
415
+ 'aggression_setting': 10,
416
+ 'window_size': 512,
417
+ 'batch_size': 4,
418
+ 'crop_size': 256,
419
+ 'is_tta': False,
420
+ 'is_output_image': False,
421
+ 'is_post_process': False,
422
+ 'is_high_end_process': False,
423
+ 'post_process_threshold': 0.2,
424
+ 'vr_voc_inst_secondary_model': NO_MODEL,
425
+ 'vr_other_secondary_model': NO_MODEL,
426
+ 'vr_bass_secondary_model': NO_MODEL,
427
+ 'vr_drums_secondary_model': NO_MODEL,
428
+ 'vr_is_secondary_model_activate': False,
429
+ 'vr_voc_inst_secondary_model_scale': 0.9,
430
+ 'vr_other_secondary_model_scale': 0.7,
431
+ 'vr_bass_secondary_model_scale': 0.5,
432
+ 'vr_drums_secondary_model_scale': 0.5,
433
+ 'demucs_model': CHOOSE_MODEL,
434
+ 'demucs_stems': ALL_STEMS,
435
+ 'segment': DEMUCS_SEGMENTS[0],
436
+ 'overlap': DEMUCS_OVERLAP[0],
437
+ 'shifts': 2,
438
+ 'chunks_demucs': CHUNKS[0],
439
+ 'margin_demucs': 44100,
440
+ 'is_chunk_demucs': False,
441
+ 'is_chunk_mdxnet': False,
442
+ 'is_primary_stem_only_Demucs': False,
443
+ 'is_secondary_stem_only_Demucs': False,
444
+ 'is_split_mode': True,
445
+ 'is_demucs_combine_stems': True,
446
+ 'demucs_voc_inst_secondary_model': NO_MODEL,
447
+ 'demucs_other_secondary_model': NO_MODEL,
448
+ 'demucs_bass_secondary_model': NO_MODEL,
449
+ 'demucs_drums_secondary_model': NO_MODEL,
450
+ 'demucs_is_secondary_model_activate': False,
451
+ 'demucs_voc_inst_secondary_model_scale': 0.9,
452
+ 'demucs_other_secondary_model_scale': 0.7,
453
+ 'demucs_bass_secondary_model_scale': 0.5,
454
+ 'demucs_drums_secondary_model_scale': 0.5,
455
+ 'demucs_stems': ALL_STEMS,
456
+ 'demucs_pre_proc_model': NO_MODEL,
457
+ 'is_demucs_pre_proc_model_activate': False,
458
+ 'is_demucs_pre_proc_model_inst_mix': False,
459
+ 'mdx_net_model': CHOOSE_MODEL,
460
+ 'chunks': CHUNKS[0],
461
+ 'margin': 44100,
462
+ 'compensate': AUTO_SELECT,
463
+ 'is_denoise': False,
464
+ 'is_invert_spec': False,
465
+ 'is_mixer_mode': False,
466
+ 'mdx_batch_size': DEF_OPT,
467
+ 'mdx_voc_inst_secondary_model': NO_MODEL,
468
+ 'mdx_other_secondary_model': NO_MODEL,
469
+ 'mdx_bass_secondary_model': NO_MODEL,
470
+ 'mdx_drums_secondary_model': NO_MODEL,
471
+ 'mdx_is_secondary_model_activate': False,
472
+ 'mdx_voc_inst_secondary_model_scale': 0.9,
473
+ 'mdx_other_secondary_model_scale': 0.7,
474
+ 'mdx_bass_secondary_model_scale': 0.5,
475
+ 'mdx_drums_secondary_model_scale': 0.5,
476
+ 'is_save_all_outputs_ensemble': True,
477
+ 'is_append_ensemble_name': False,
478
+ 'chosen_audio_tool': AUDIO_TOOL_OPTIONS[0],
479
+ 'choose_algorithm': MANUAL_ENSEMBLE_OPTIONS[0],
480
+ 'time_stretch_rate': 2.0,
481
+ 'pitch_rate': 2.0,
482
+ 'is_gpu_conversion': False,
483
+ 'is_primary_stem_only': False,
484
+ 'is_secondary_stem_only': False,
485
+ 'is_testing_audio': False,
486
+ 'is_add_model_name': False,
487
+ 'is_accept_any_input': False,
488
+ 'is_task_complete': False,
489
+ 'is_normalization': False,
490
+ 'is_create_model_folder': False,
491
+ 'mp3_bit_set': '320k',
492
+ 'save_format': WAV,
493
+ 'wav_type_set': 'PCM_16',
494
+ 'user_code': '',
495
+ 'export_path': '',
496
+ 'input_paths': [],
497
+ 'lastDir': None,
498
+ 'export_path': '',
499
+ 'model_hash_table': None,
500
+ 'help_hints_var': False,
501
+ 'model_sample_mode': False,
502
+ 'model_sample_mode_duration': 30
503
+ }
504
+
505
+ SETTING_CHECK = ('vr_model',
506
+ 'aggression_setting',
507
+ 'window_size',
508
+ 'batch_size',
509
+ 'crop_size',
510
+ 'is_tta',
511
+ 'is_output_image',
512
+ 'is_post_process',
513
+ 'is_high_end_process',
514
+ 'post_process_threshold',
515
+ 'vr_voc_inst_secondary_model',
516
+ 'vr_other_secondary_model',
517
+ 'vr_bass_secondary_model',
518
+ 'vr_drums_secondary_model',
519
+ 'vr_is_secondary_model_activate',
520
+ 'vr_voc_inst_secondary_model_scale',
521
+ 'vr_other_secondary_model_scale',
522
+ 'vr_bass_secondary_model_scale',
523
+ 'vr_drums_secondary_model_scale',
524
+ 'demucs_model',
525
+ 'segment',
526
+ 'overlap',
527
+ 'shifts',
528
+ 'chunks_demucs',
529
+ 'margin_demucs',
530
+ 'is_chunk_demucs',
531
+ 'is_primary_stem_only_Demucs',
532
+ 'is_secondary_stem_only_Demucs',
533
+ 'is_split_mode',
534
+ 'is_demucs_combine_stems',
535
+ 'demucs_voc_inst_secondary_model',
536
+ 'demucs_other_secondary_model',
537
+ 'demucs_bass_secondary_model',
538
+ 'demucs_drums_secondary_model',
539
+ 'demucs_is_secondary_model_activate',
540
+ 'demucs_voc_inst_secondary_model_scale',
541
+ 'demucs_other_secondary_model_scale',
542
+ 'demucs_bass_secondary_model_scale',
543
+ 'demucs_drums_secondary_model_scale',
544
+ 'demucs_stems',
545
+ 'mdx_net_model',
546
+ 'chunks',
547
+ 'margin',
548
+ 'compensate',
549
+ 'is_denoise',
550
+ 'is_invert_spec',
551
+ 'mdx_batch_size',
552
+ 'mdx_voc_inst_secondary_model',
553
+ 'mdx_other_secondary_model',
554
+ 'mdx_bass_secondary_model',
555
+ 'mdx_drums_secondary_model',
556
+ 'mdx_is_secondary_model_activate',
557
+ 'mdx_voc_inst_secondary_model_scale',
558
+ 'mdx_other_secondary_model_scale',
559
+ 'mdx_bass_secondary_model_scale',
560
+ 'mdx_drums_secondary_model_scale',
561
+ 'is_save_all_outputs_ensemble',
562
+ 'is_append_ensemble_name',
563
+ 'chosen_audio_tool',
564
+ 'choose_algorithm',
565
+ 'time_stretch_rate',
566
+ 'pitch_rate',
567
+ 'is_primary_stem_only',
568
+ 'is_secondary_stem_only',
569
+ 'is_testing_audio',
570
+ 'is_add_model_name',
571
+ "is_accept_any_input",
572
+ 'is_task_complete',
573
+ 'is_create_model_folder',
574
+ 'mp3_bit_set',
575
+ 'save_format',
576
+ 'wav_type_set',
577
+ 'user_code',
578
+ 'is_gpu_conversion',
579
+ 'is_normalization',
580
+ 'help_hints_var',
581
+ 'model_sample_mode',
582
+ 'model_sample_mode_duration')
583
+
584
+ # Message Box Text
585
+
586
+ INVALID_INPUT = 'Invalid Input', 'The input is invalid.\n\nPlease verify the input still exists or is valid and try again.'
587
+ INVALID_EXPORT = 'Invalid Export Directory', 'You have selected an invalid export directory.\n\nPlease make sure the selected directory still exists.'
588
+ INVALID_ENSEMBLE = 'Not Enough Models', 'You must select 2 or more models to run ensemble.'
589
+ INVALID_MODEL = 'No Model Chosen', 'You must select an model to continue.'
590
+ MISSING_MODEL = 'Model Missing', 'The selected model is missing or not valid.'
591
+ ERROR_OCCURED = 'Error Occured', '\n\nWould you like to open the error log for more details?\n'
592
+
593
+ # GUI Text Constants
594
+
595
+ BACK_TO_MAIN_MENU = 'Back to Main Menu'
596
+
597
+ # Help Hint Text
598
+
599
+ INTERNAL_MODEL_ATT = 'Internal model attribute. \n\n ***Do not change this setting if you are unsure!***'
600
+ STOP_HELP = 'Halts any running processes. \n A pop-up window will ask the user to confirm the action.'
601
+ SETTINGS_HELP = 'Opens the main settings guide. This window includes the \"Download Center\"'
602
+ COMMAND_TEXT_HELP = 'Provides information on the progress of the current process.'
603
+ SAVE_CURRENT_SETTINGS_HELP = 'Allows the user to open any saved settings or save the current application settings.'
604
+ CHUNKS_HELP = ('For MDX-Net, all values use the same amount of resources. Using chunks is no longer recommended.\n\n' + \
605
+ '• This option is now only for output quality.\n' + \
606
+ '• Some tracks may fare better depending on the value.\n' + \
607
+ '• Some tracks may fare worse depending on the value.\n' + \
608
+ '• Larger chunk sizes use will take less time to process.\n' +\
609
+ '• Smaller chunk sizes use will take more time to process.\n')
610
+ CHUNKS_DEMUCS_HELP = ('This option allows the user to reduce (or increase) RAM or V-RAM usage.\n\n' + \
611
+ '• Smaller chunk sizes use less RAM or V-RAM but can also increase processing times.\n' + \
612
+ '• Larger chunk sizes use more RAM or V-RAM but can also reduce processing times.\n' + \
613
+ '• Selecting \"Auto\" calculates an appropriate chuck size based on how much RAM or V-RAM your system has.\n' + \
614
+ '• Selecting \"Full\" will process the track as one whole chunk. (not recommended)\n' + \
615
+ '• The default selection is \"Auto\".')
616
+ MARGIN_HELP = 'Selects the frequency margins to slice the chunks from.\n\n• The recommended margin size is 44100.\n• Other values can give unpredictable results.'
617
+ AGGRESSION_SETTING_HELP = ('This option allows you to set how strong the primary stem extraction will be.\n\n' + \
618
+ '• The range is 0-100.\n' + \
619
+ '• Higher values perform deeper extractions.\n' + \
620
+ '• The default is 10 for instrumental & vocal models.\n' + \
621
+ '• Values over 10 can result in muddy-sounding instrumentals for the non-vocal models')
622
+ WINDOW_SIZE_HELP = ('The smaller your window size, the better your conversions will be. \nHowever, a smaller window means longer conversion times and heavier resource usage.\n\n' + \
623
+ 'Breakdown of the selectable window size values:\n' + \
624
+ '• 1024 - Low conversion quality, shortest conversion time, low resource usage.\n' + \
625
+ '• 512 - Average conversion quality, average conversion time, normal resource usage.\n' + \
626
+ '• 320 - Better conversion quality.')
627
+ DEMUCS_STEMS_HELP = ('Here, you can choose which stem to extract using the selected model.\n\n' +\
628
+ 'Stem Selections:\n\n' +\
629
+ '• All Stems - Saves all of the stems the model is able to extract.\n' +\
630
+ '• Vocals - Pulls vocal stem only.\n' +\
631
+ '• Other - Pulls other stem only.\n' +\
632
+ '• Bass - Pulls bass stem only.\n' +\
633
+ '• Drums - Pulls drum stem only.\n')
634
+ SEGMENT_HELP = ('This option allows the user to reduce (or increase) RAM or V-RAM usage.\n\n' + \
635
+ '• Smaller segment sizes use less RAM or V-RAM but can also increase processing times.\n' + \
636
+ '• Larger segment sizes use more RAM or V-RAM but can also reduce processing times.\n' + \
637
+ '• Selecting \"Default\" uses the recommended segment size.\n' + \
638
+ '• It is recommended that you not use segments with \"Chunking\".')
639
+ ENSEMBLE_MAIN_STEM_HELP = 'Allows the user to select the type of stems they wish to ensemble.\n\nOptions:\n\n' +\
640
+ f'• {VOCAL_PAIR} - The primary stem will be the vocals and the secondary stem will be the the instrumental\n' +\
641
+ f'• {OTHER_PAIR} - The primary stem will be other and the secondary stem will be no other (the mixture without the \'other\' stem)\n' +\
642
+ f'• {BASS_PAIR} - The primary stem will be bass and the secondary stem will be no bass (the mixture without the \'bass\' stem)\n' +\
643
+ f'• {DRUM_PAIR} - The primary stem will be drums and the secondary stem will be no drums (the mixture without the \'drums\' stem)\n' +\
644
+ f'• {FOUR_STEM_ENSEMBLE} - This option will gather all the 4 stem Demucs models and ensemble all of the outputs.\n'
645
+ ENSEMBLE_TYPE_HELP = 'Allows the user to select the ensemble algorithm to be used to generate the final output.\n\nExample & Other Note:\n\n' +\
646
+ f'• {MAX_MIN} - If this option is chosen, the primary stem outputs will be processed through \nthe \'Max Spec\' algorithm, and the secondary stem will be processed through the \'Min Spec\' algorithm.\n' +\
647
+ f'• Only a single algorithm will be shown when the \'4 Stem Ensemble\' option is chosen.\n\nAlgorithm Details:\n\n' +\
648
+ f'• {MAX_SPEC} - This algorithm combines the final results and generates the highest possible output from them.\nFor example, if this algorithm were processing vocal stems, you would get the fullest possible \n' +\
649
+ 'result making the ensembled vocal stem sound cleaner. However, it might result in more unwanted artifacts.\n' +\
650
+ f'• {MIN_SPEC} - This algorithm combines the results and generates the lowest possible output from them.\nFor example, if this algorithm were processing instrumental stems, you would get the cleanest possible result \n' +\
651
+ 'result, eliminating more unwanted artifacts. However, the result might also sound \'muddy\' and lack a fuller sound.\n' +\
652
+ f'• {AUDIO_AVERAGE} - This algorithm simply combines the results and averages all of them together. \n'
653
+ ENSEMBLE_LISTBOX_HELP = 'List of the all the models available for the main stem pair selected.'
654
+ IS_GPU_CONVERSION_HELP = ('When checked, the application will attempt to use your GPU (if you have one).\n' +\
655
+ 'If you do not have a GPU but have this checked, the application will default to your CPU.\n\n' +\
656
+ 'Note: CPU conversions are much slower than those processed through the GPU.')
657
+ SAVE_STEM_ONLY_HELP = 'Allows the user to save only the selected stem.'
658
+ IS_NORMALIZATION_HELP = 'Normalizes output to prevent clipping.'
659
+ CROP_SIZE_HELP = '**Only compatible with select models only!**\n\n Setting should match training crop-size value. Leave as is if unsure.'
660
+ IS_TTA_HELP = ('This option performs Test-Time-Augmentation to improve the separation quality.\n\n' +\
661
+ 'Note: Having this selected will increase the time it takes to complete a conversion')
662
+ IS_POST_PROCESS_HELP = ('This option can potentially identify leftover instrumental artifacts within the vocal outputs. \nThis option may improve the separation of some songs.\n\n' +\
663
+ 'Note: Selecting this option can adversely affect the conversion process, depending on the track. Because of this, it is only recommended as a last resort.')
664
+ IS_HIGH_END_PROCESS_HELP = 'The application will mirror the missing frequency range of the output.'
665
+ SHIFTS_HELP = ('Performs multiple predictions with random shifts of the input and averages them.\n\n' +\
666
+ '• The higher number of shifts, the longer the prediction will take. \n- Not recommended unless you have a GPU.')
667
+ OVERLAP_HELP = 'This option controls the amount of overlap between prediction windows (for Demucs one window is 10 seconds)'
668
+ IS_CHUNK_DEMUCS_HELP = '• Enables \"Chunks\".\n• We recommend you not enable this option with \"Split Mode\" enabled or with the Demucs v4 Models.'
669
+ IS_CHUNK_MDX_NET_HELP = '• Enables \"Chunks\".\n• Using this option for MDX-Net no longer effects RAM usage.\n• Having this enabled will effect output quality, for better or worse depending on the set value.'
670
+ IS_SPLIT_MODE_HELP = ('• Enables \"Segments\". \n• We recommend you not enable this option with \"Enable Chunks\".\n' +\
671
+ '• Deselecting this option is only recommended for those with powerful PCs or if using \"Chunk\" mode instead.')
672
+ IS_DEMUCS_COMBINE_STEMS_HELP = 'The application will create the secondary stem by combining the remaining stems \ninstead of inverting the primary stem with the mixture.'
673
+ COMPENSATE_HELP = 'Compensates the audio of the primary stems to allow for a better secondary stem.'
674
+ IS_DENOISE_HELP = '• This option removes a majority of the noise generated by the MDX-Net models.\n• The conversion will take nearly twice as long with this enabled.'
675
+ CLEAR_CACHE_HELP = 'Clears any user selected model settings for previously unrecognized models.'
676
+ IS_SAVE_ALL_OUTPUTS_ENSEMBLE_HELP = 'Enabling this option will keep all indivudual outputs generated by an ensemble.'
677
+ IS_APPEND_ENSEMBLE_NAME_HELP = 'The application will append the ensemble name to the final output \nwhen this option is enabled.'
678
+ DONATE_HELP = 'Takes the user to an external web-site to donate to this project!'
679
+ IS_INVERT_SPEC_HELP = '• This option may produce a better secondary stem.\n• Inverts primary stem with mixture using spectragrams instead of wavforms.\n• This inversion method is slightly slower.'
680
+ IS_MIXER_MODE_HELP = '• This option may improve separations for outputs from 4-stem models.\n• Might produce more noise.\n• This option might slow down separation time.'
681
+ IS_TESTING_AUDIO_HELP = 'Appends a unique 10 digit number to output files so the user \ncan compare results with different settings.'
682
+ IS_MODEL_TESTING_AUDIO_HELP = 'Appends the model name to output files so the user \ncan compare results with different settings.'
683
+ IS_ACCEPT_ANY_INPUT_HELP = 'The application will accept any input when enabled, even if it does not have an audio format extension.\n\nThis is for experimental purposes, and having it enabled is not recommended.'
684
+ IS_TASK_COMPLETE_HELP = 'When enabled, chimes will be heard when a process completes or fails.'
685
+ IS_CREATE_MODEL_FOLDER_HELP = 'Two new directories will be generated for the outputs in \nthe export directory after each conversion.\n\n' +\
686
+ '• First directory - Named after the model.\n' +\
687
+ '• Second directory - Named after the track.\n\n' +\
688
+ '• Example: \n\n' +\
689
+ '─ Export Directory\n' +\
690
+ ' └── First Directory\n' +\
691
+ ' └── Second Directory\n' +\
692
+ ' └── Output File(s)'
693
+ DELETE_YOUR_SETTINGS_HELP = 'This menu contains your saved settings. You will be asked to \nconfirm if you wish to delete the selected setting.'
694
+ SET_STEM_NAME_HELP = 'Choose the primary stem for the selected model.'
695
+ MDX_DIM_T_SET_HELP = INTERNAL_MODEL_ATT
696
+ MDX_DIM_F_SET_HELP = INTERNAL_MODEL_ATT
697
+ MDX_N_FFT_SCALE_SET_HELP = 'Set the N_FFT size the model was trained with.'
698
+ POPUP_COMPENSATE_HELP = f'Choose the appropriate voluem compensattion for the selected model\n\nReminder: {COMPENSATE_HELP}'
699
+ VR_MODEL_PARAM_HELP = 'Choose the parameters needed to run the selected model.'
700
+ CHOSEN_ENSEMBLE_HELP = 'Select saved enselble or save current ensemble.\n\nDefault Selections:\n\n• Save the current ensemble.\n• Clears all current model selections.'
701
+ CHOSEN_PROCESS_METHOD_HELP = 'Here, you choose between different Al networks and algorithms to process your track.\n\n' +\
702
+ 'There are five options:\n\n' +\
703
+ '• VR Architecture - These models use magnitude spectrograms for Source Separation.\n' +\
704
+ '• MDX-Net - These models use Hybrid Spectrogram/Waveform for Source Separation.\n' +\
705
+ '• Demucs v3 - These models use Hybrid Spectrogram/Waveform for Source Separation.\n' +\
706
+ '• Ensemble Mode - Here, you can get the best results from multiple models and networks.\n' +\
707
+ '• Audio Tools - These are additional tools for added convenience.'
708
+ INPUT_FOLDER_ENTRY_HELP = 'Select Input:\n\nHere is where you select the audio files(s) you wish to process.'
709
+ INPUT_FOLDER_ENTRY_HELP_2 = 'Input Option Menu:\n\nClick here to access the input option menu.'
710
+ OUTPUT_FOLDER_ENTRY_HELP = 'Select Output:\n\nHere is where you select the directory where your processed files are to be saved.'
711
+ INPUT_FOLDER_BUTTON_HELP = 'Open Input Folder Button: \n\nOpens the directory containing the selected input audio file(s).'
712
+ OUTPUT_FOLDER_BUTTON_HELP = 'Open Output Folder Button: \n\nOpens the selected output folder.'
713
+ CHOOSE_MODEL_HELP = 'Each process method comes with its own set of options and models.\n\nHere is where you choose the model associated with the selected process method.'
714
+ FORMAT_SETTING_HELP = 'Save outputs as '
715
+ SECONDARY_MODEL_ACTIVATE_HELP = 'When enabled, the application will run an additional inference with the selected model(s) above.'
716
+ SECONDARY_MODEL_HELP = 'Choose the secondary model associated with this stem you wish to run with the current process method.'
717
+ SECONDARY_MODEL_SCALE_HELP = 'The scale determines how the final audio outputs will be averaged between the primary and secondary models.\n\nFor example:\n\n' +\
718
+ '• 10% - 10 percent of the main model result will be factored into the final result.\n' +\
719
+ '• 50% - The results from the main and secondary models will be averaged evenly.\n' +\
720
+ '• 90% - 90 percent of the main model result will be factored into the final result.'
721
+ PRE_PROC_MODEL_ACTIVATE_HELP = 'The application will run an inference with the selected model above, pulling only the instrumental stem when enabled. \nFrom there, all of the non-vocal stems will be pulled from the generated instrumental.\n\nNotes:\n\n' +\
722
+ '• This option can significantly reduce vocal bleed within the non-vocal stems.\n' +\
723
+ '• It is only available in Demucs.\n' +\
724
+ '• It is only compatible with non-vocal and non-instrumental stem outputs.\n' +\
725
+ '• This will increase thetotal processing time.\n' +\
726
+ '• Only VR and MDX-Net Vocal or Instrumental models are selectable above.'
727
+
728
+ AUDIO_TOOLS_HELP = 'Here, you choose between different audio tools to process your track.\n\n' +\
729
+ '• Manual Ensemble - You must have 2 or more files selected as your inputs. Allows the user to run their tracks through \nthe same algorithms used in Ensemble Mode.\n' +\
730
+ '• Align Inputs - You must have exactly 2 files selected as your inputs. The second input will be aligned with the first input.\n' +\
731
+ '• Time Stretch - The user can speed up or slow down the selected inputs.\n' +\
732
+ '• Change Pitch - The user can change the pitch for the selected inputs.\n'
733
+ PRE_PROC_MODEL_INST_MIX_HELP = 'When enabled, the application will generate a third output without the selected stem and vocals.'
734
+ MODEL_SAMPLE_MODE_HELP = 'Allows the user to process only part of a track to sample settings or a model without \nrunning a full conversion.\n\nNotes:\n\n' +\
735
+ '• The number in the parentheses is the current number of seconds the generated sample will be.\n' +\
736
+ '• You can choose the number of seconds to extract from the track in the \"Additional Settings\" menu.'
737
+
738
+ POST_PROCESS_THREASHOLD_HELP = 'Allows the user to control the intensity of the Post_process option.\n\nNotes:\n\n' +\
739
+ '• Higher values potentially remove more artifacts. However, bleed might increase.\n' +\
740
+ '• Lower values limit artifact removal.'
741
+
742
+ BATCH_SIZE_HELP = 'Specify the number of batches to be processed at a time.\n\nNotes:\n\n' +\
743
+ '• Higher values mean more RAM usage but slightly faster processing times.\n' +\
744
+ '• Lower values mean less RAM usage but slightly longer processing times.\n' +\
745
+ '• Batch size value has no effect on output quality.'
746
+
747
+ # Warning Messages
748
+
749
+ STORAGE_ERROR = 'Insufficient Storage', 'There is not enough storage on main drive to continue. Your main drive must have at least 3 GB\'s of storage in order for this application function properly. \n\nPlease ensure your main drive has at least 3 GB\'s of storage and try again.\n\n'
750
+ STORAGE_WARNING = 'Available Storage Low', 'Your main drive is running low on storage. Your main drive must have at least 3 GB\'s of storage in order for this application function properly.\n\n'
751
+ CONFIRM_WARNING = '\nAre you sure you wish to continue?'
752
+ PROCESS_FAILED = 'Process failed, please see error log\n'
753
+ EXIT_PROCESS_ERROR = 'Active Process', 'Please stop the active process or wait for it to complete before you exit.'
754
+ EXIT_HALTED_PROCESS_ERROR = 'Halting Process', 'Please wait for the application to finish halting the process before exiting.'
755
+ EXIT_DOWNLOAD_ERROR = 'Active Download', 'Please stop the download or wait for it to complete before you exit.'
756
+ SET_TO_DEFAULT_PROCESS_ERROR = 'Active Process', 'You cannot reset all of the application settings during an active process.'
757
+ SET_TO_ANY_PROCESS_ERROR = 'Active Process', 'You cannot reset the application settings during an active process.'
758
+ RESET_ALL_TO_DEFAULT_WARNING = 'Reset Settings Confirmation', 'All application settings will be set to factory default.\n\nAre you sure you wish to continue?'
759
+ AUDIO_VERIFICATION_CHECK = lambda i, e:f'++++++++++++++++++++++++++++++++++++++++++++++++++++\n\nBroken File Removed: \n\n{i}\n\nError Details:\n\n{e}\n++++++++++++++++++++++++++++++++++++++++++++++++++++'
760
+ INVALID_ONNX_MODEL_ERROR = 'Invalid Model', 'The file selected is not a valid MDX-Net model. Please see the error log for more information.'
761
+
762
+
763
+ # Separation Text
764
+
765
+ LOADING_MODEL = 'Loading model...'
766
+ INFERENCE_STEP_1 = 'Running inference...'
767
+ INFERENCE_STEP_1_SEC = 'Running inference (secondary model)...'
768
+ INFERENCE_STEP_1_4_STEM = lambda stem:f'Running inference (secondary model for {stem})...'
769
+ INFERENCE_STEP_1_PRE = 'Running inference (pre-process model)...'
770
+ INFERENCE_STEP_2_PRE = lambda pm, m:f'Loading pre-process model ({pm}: {m})...'
771
+ INFERENCE_STEP_2_SEC = lambda pm, m:f'Loading secondary model ({pm}: {m})...'
772
+ INFERENCE_STEP_2_SEC_CACHED_MODOEL = lambda pm, m:f'Secondary model ({pm}: {m}) cache loaded.\n'
773
+ INFERENCE_STEP_2_PRE_CACHED_MODOEL = lambda pm, m:f'Pre-process model ({pm}: {m}) cache loaded.\n'
774
+ INFERENCE_STEP_2_SEC_CACHED = 'Loading cached secondary model source(s)... Done!\n'
775
+ INFERENCE_STEP_2_PRIMARY_CACHED = 'Model cache loaded.\n'
776
+ INFERENCE_STEP_2 = 'Inference complete.'
777
+ SAVING_STEM = 'Saving ', ' stem...'
778
+ SAVING_ALL_STEMS = 'Saving all stems...'
779
+ ENSEMBLING_OUTPUTS = 'Ensembling outputs...'
780
+ DONE = ' Done!\n'
781
+ ENSEMBLES_SAVED = 'Ensembled outputs saved!\n\n'
782
+ NEW_LINES = "\n\n"
783
+ NEW_LINE = "\n"
784
+ NO_LINE = ''
785
+
786
+ # Widget Placements
787
+
788
+ MAIN_ROW_Y = -15, -17
789
+ MAIN_ROW_X = -4, 21
790
+ MAIN_ROW_WIDTH = -53
791
+ MAIN_ROW_2_Y = -15, -17
792
+ MAIN_ROW_2_X = -28, 1
793
+ CHECK_BOX_Y = 0
794
+ CHECK_BOX_X = 20
795
+ CHECK_BOX_WIDTH = -50
796
+ CHECK_BOX_HEIGHT = 2
797
+ LEFT_ROW_WIDTH = -10
798
+ LABEL_HEIGHT = -5
799
+ OPTION_HEIGHT = 7
800
+ LOW_MENU_Y = 18, 16
801
+ FFMPEG_EXT = (".aac", ".aiff", ".alac" ,".flac", ".FLAC", ".mov", ".mp4", ".MP4",
802
+ ".m4a", ".M4A", ".mp2", ".mp3", "MP3", ".mpc", ".mpc8",
803
+ ".mpeg", ".ogg", ".OGG", ".tta", ".wav", ".wave", ".WAV", ".WAVE", ".wma", ".webm", ".eac3", ".mkv")
804
+
805
+ FFMPEG_MORE_EXT = (".aa", ".aac", ".ac3", ".aiff", ".alac", ".avi", ".f4v",".flac", ".flic", ".flv",
806
+ ".m4v",".mlv", ".mov", ".mp4", ".m4a", ".mp2", ".mp3", ".mp4", ".mpc", ".mpc8",
807
+ ".mpeg", ".ogg", ".tta", ".tty", ".vcd", ".wav", ".wma")
808
+ ANY_EXT = ""
809
+
810
+ # Secondary Menu Constants
811
+
812
+ VOCAL_PAIR_PLACEMENT = 1, 2, 3, 4
813
+ OTHER_PAIR_PLACEMENT = 5, 6, 7, 8
814
+ BASS_PAIR_PLACEMENT = 9, 10, 11, 12
815
+ DRUMS_PAIR_PLACEMENT = 13, 14, 15, 16
816
+
817
+ # Drag n Drop String Checks
818
+
819
+ DOUBLE_BRACKET = "} {"
820
+ RIGHT_BRACKET = "}"
821
+ LEFT_BRACKET = "{"
822
+
823
+ # Manual Downloads
824
+
825
+ VR_PLACEMENT_TEXT = 'Place models in \"models/VR_Models\" directory.'
826
+ MDX_PLACEMENT_TEXT = 'Place models in \"models/MDX_Net_Models\" directory.'
827
+ DEMUCS_PLACEMENT_TEXT = 'Place models in \"models/Demucs_Models\" directory.'
828
+ DEMUCS_V3_V4_PLACEMENT_TEXT = 'Place items in \"models/Demucs_Models/v3_v4_repo\" directory.'
829
+
830
+ FULL_DOWNLOAD_LIST_VR = {
831
+ "VR Arch Single Model v5: 1_HP-UVR": "1_HP-UVR.pth",
832
+ "VR Arch Single Model v5: 2_HP-UVR": "2_HP-UVR.pth",
833
+ "VR Arch Single Model v5: 3_HP-Vocal-UVR": "3_HP-Vocal-UVR.pth",
834
+ "VR Arch Single Model v5: 4_HP-Vocal-UVR": "4_HP-Vocal-UVR.pth",
835
+ "VR Arch Single Model v5: 5_HP-Karaoke-UVR": "5_HP-Karaoke-UVR.pth",
836
+ "VR Arch Single Model v5: 6_HP-Karaoke-UVR": "6_HP-Karaoke-UVR.pth",
837
+ "VR Arch Single Model v5: 7_HP2-UVR": "7_HP2-UVR.pth",
838
+ "VR Arch Single Model v5: 8_HP2-UVR": "8_HP2-UVR.pth",
839
+ "VR Arch Single Model v5: 9_HP2-UVR": "9_HP2-UVR.pth",
840
+ "VR Arch Single Model v5: 10_SP-UVR-2B-32000-1": "10_SP-UVR-2B-32000-1.pth",
841
+ "VR Arch Single Model v5: 11_SP-UVR-2B-32000-2": "11_SP-UVR-2B-32000-2.pth",
842
+ "VR Arch Single Model v5: 12_SP-UVR-3B-44100": "12_SP-UVR-3B-44100.pth",
843
+ "VR Arch Single Model v5: 13_SP-UVR-4B-44100-1": "13_SP-UVR-4B-44100-1.pth",
844
+ "VR Arch Single Model v5: 14_SP-UVR-4B-44100-2": "14_SP-UVR-4B-44100-2.pth",
845
+ "VR Arch Single Model v5: 15_SP-UVR-MID-44100-1": "15_SP-UVR-MID-44100-1.pth",
846
+ "VR Arch Single Model v5: 16_SP-UVR-MID-44100-2": "16_SP-UVR-MID-44100-2.pth",
847
+ "VR Arch Single Model v4: MGM_HIGHEND_v4": "MGM_HIGHEND_v4.pth",
848
+ "VR Arch Single Model v4: MGM_LOWEND_A_v4": "MGM_LOWEND_A_v4.pth",
849
+ "VR Arch Single Model v4: MGM_LOWEND_B_v4": "MGM_LOWEND_B_v4.pth",
850
+ "VR Arch Single Model v4: MGM_MAIN_v4": "MGM_MAIN_v4.pth"
851
+ }
852
+
853
+ FULL_DOWNLOAD_LIST_MDX = {
854
+ "MDX-Net Model: UVR-MDX-NET Main": "UVR_MDXNET_Main.onnx",
855
+ "MDX-Net Model: UVR-MDX-NET Inst Main": "UVR-MDX-NET-Inst_Main.onnx",
856
+ "MDX-Net Model: UVR-MDX-NET 1": "UVR_MDXNET_1_9703.onnx",
857
+ "MDX-Net Model: UVR-MDX-NET 2": "UVR_MDXNET_2_9682.onnx",
858
+ "MDX-Net Model: UVR-MDX-NET 3": "UVR_MDXNET_3_9662.onnx",
859
+ "MDX-Net Model: UVR-MDX-NET Inst 1": "UVR-MDX-NET-Inst_1.onnx",
860
+ "MDX-Net Model: UVR-MDX-NET Inst 2": "UVR-MDX-NET-Inst_2.onnx",
861
+ "MDX-Net Model: UVR-MDX-NET Inst 3": "UVR-MDX-NET-Inst_3.onnx",
862
+ "MDX-Net Model: UVR-MDX-NET Karaoke": "UVR_MDXNET_KARA.onnx",
863
+ "MDX-Net Model: UVR_MDXNET_9482": "UVR_MDXNET_9482.onnx",
864
+ "MDX-Net Model: Kim_Vocal_1": "Kim_Vocal_1.onnx",
865
+ "MDX-Net Model: kuielab_a_vocals": "kuielab_a_vocals.onnx",
866
+ "MDX-Net Model: kuielab_a_other": "kuielab_a_other.onnx",
867
+ "MDX-Net Model: kuielab_a_bass": "kuielab_a_bass.onnx",
868
+ "MDX-Net Model: kuielab_a_drums": "kuielab_a_drums.onnx",
869
+ "MDX-Net Model: kuielab_b_vocals": "kuielab_b_vocals.onnx",
870
+ "MDX-Net Model: kuielab_b_other": "kuielab_b_other.onnx",
871
+ "MDX-Net Model: kuielab_b_bass": "kuielab_b_bass.onnx",
872
+ "MDX-Net Model: kuielab_b_drums": "kuielab_b_drums.onnx"}
873
+
874
+ FULL_DOWNLOAD_LIST_DEMUCS = {
875
+
876
+ "Demucs v4: htdemucs_ft":{
877
+ "f7e0c4bc-ba3fe64a.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/f7e0c4bc-ba3fe64a.th",
878
+ "d12395a8-e57c48e6.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/d12395a8-e57c48e6.th",
879
+ "92cfc3b6-ef3bcb9c.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/92cfc3b6-ef3bcb9c.th",
880
+ "04573f0d-f3cf25b2.th":"https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/04573f0d-f3cf25b2.th",
881
+ "htdemucs_ft.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_ft.yaml"
882
+ },
883
+
884
+ "Demucs v4: htdemucs":{
885
+ "955717e8-8726e21a.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th",
886
+ "htdemucs.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs.yaml"
887
+ },
888
+
889
+ "Demucs v4: hdemucs_mmi":{
890
+ "75fc33f5-1941ce65.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/75fc33f5-1941ce65.th",
891
+ "hdemucs_mmi.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/hdemucs_mmi.yaml"
892
+ },
893
+ "Demucs v4: htdemucs_6s":{
894
+ "5c90dfd2-34c22ccb.th": "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/5c90dfd2-34c22ccb.th",
895
+ "htdemucs_6s.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/htdemucs_6s.yaml"
896
+ },
897
+ "Demucs v3: mdx":{
898
+ "0d19c1c6-0f06f20e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/0d19c1c6-0f06f20e.th",
899
+ "7ecf8ec1-70f50cc9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7ecf8ec1-70f50cc9.th",
900
+ "c511e2ab-fe698775.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/c511e2ab-fe698775.th",
901
+ "7d865c68-3d5dd56b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7d865c68-3d5dd56b.th",
902
+ "mdx.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx.yaml"
903
+ },
904
+
905
+ "Demucs v3: mdx_q":{
906
+ "6b9c2ca1-3fd82607.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/6b9c2ca1-3fd82607.th",
907
+ "b72baf4e-8778635e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/b72baf4e-8778635e.th",
908
+ "42e558d4-196e0e1b.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/42e558d4-196e0e1b.th",
909
+ "305bc58f-18378783.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/305bc58f-18378783.th",
910
+ "mdx_q.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_q.yaml"
911
+ },
912
+
913
+ "Demucs v3: mdx_extra":{
914
+ "e51eebcc-c1b80bdd.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/e51eebcc-c1b80bdd.th",
915
+ "a1d90b5c-ae9d2452.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/a1d90b5c-ae9d2452.th",
916
+ "5d2d6c55-db83574e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/5d2d6c55-db83574e.th",
917
+ "cfa93e08-61801ae1.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/cfa93e08-61801ae1.th",
918
+ "mdx_extra.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_extra.yaml"
919
+ },
920
+
921
+ "Demucs v3: mdx_extra_q": {
922
+ "83fc094f-4a16d450.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/83fc094f-4a16d450.th",
923
+ "464b36d7-e5a9386e.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/464b36d7-e5a9386e.th",
924
+ "14fc6a69-a89dd0ee.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/14fc6a69-a89dd0ee.th",
925
+ "7fd6ef75-a905dd85.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/7fd6ef75-a905dd85.th",
926
+ "mdx_extra_q.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/mdx_extra_q.yaml"
927
+ },
928
+
929
+ "Demucs v3: UVR Model":{
930
+ "ebf34a2db.th": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/ebf34a2db.th",
931
+ "UVR_Demucs_Model_1.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/UVR_Demucs_Model_1.yaml"
932
+ },
933
+
934
+ "Demucs v3: repro_mdx_a":{
935
+ "9a6b4851-03af0aa6.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
936
+ "1ef250f1-592467ce.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
937
+ "fa0cb7f9-100d8bf4.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
938
+ "902315c2-b39ce9c9.th": "https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
939
+ "repro_mdx_a.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a.yaml"
940
+ },
941
+
942
+ "Demucs v3: repro_mdx_a_time_only":{
943
+ "9a6b4851-03af0aa6.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/9a6b4851-03af0aa6.th",
944
+ "1ef250f1-592467ce.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/1ef250f1-592467ce.th",
945
+ "repro_mdx_a_time_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_time_only.yaml"
946
+ },
947
+
948
+ "Demucs v3: repro_mdx_a_hybrid_only":{
949
+ "fa0cb7f9-100d8bf4.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/fa0cb7f9-100d8bf4.th",
950
+ "902315c2-b39ce9c9.th":"https://dl.fbaipublicfiles.com/demucs/mdx_final/902315c2-b39ce9c9.th",
951
+ "repro_mdx_a_hybrid_only.yaml": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/repro_mdx_a_hybrid_only.yaml"
952
+ },
953
+
954
+ "Demucs v2: demucs": {
955
+ "demucs-e07c671f.th": "https://dl.fbaipublicfiles.com/demucs/v3.0/demucs-e07c671f.th"
956
+ },
957
+
958
+ "Demucs v2: demucs_extra": {
959
+ "demucs_extra-3646af93.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_extra-3646af93.th"
960
+ },
961
+
962
+ "Demucs v2: demucs48_hq": {
963
+ "demucs48_hq-28a1282c.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs48_hq-28a1282c.th"
964
+ },
965
+
966
+ "Demucs v2: tasnet": {
967
+ "tasnet-beb46fac.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet-beb46fac.th"
968
+ },
969
+
970
+ "Demucs v2: tasnet_extra": {
971
+ "tasnet_extra-df3777b2.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/tasnet_extra-df3777b2.th"
972
+ },
973
+
974
+ "Demucs v2: demucs_unittest": {
975
+ "demucs_unittest-09ebc15f.th":"https://dl.fbaipublicfiles.com/demucs/v3.0/demucs_unittest-09ebc15f.th"
976
+ },
977
+
978
+ "Demucs v1: demucs": {
979
+ "demucs.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs.th"
980
+ },
981
+
982
+ "Demucs v1: demucs_extra": {
983
+ "demucs_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/demucs_extra.th"
984
+ },
985
+
986
+ "Demucs v1: light": {
987
+ "light.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light.th"
988
+ },
989
+
990
+ "Demucs v1: light_extra": {
991
+ "light_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/light_extra.th"
992
+ },
993
+
994
+ "Demucs v1: tasnet": {
995
+ "tasnet.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet.th"
996
+ },
997
+
998
+ "Demucs v1: tasnet_extra": {
999
+ "tasnet_extra.th":"https://dl.fbaipublicfiles.com/demucs/v2.0/tasnet_extra.th"
1000
+ }
1001
+ }
1002
+
1003
+ # Main Menu Labels
1004
+
1005
+ CHOOSE_PROC_METHOD_MAIN_LABEL = 'CHOOSE PROCESS METHOD'
1006
+ SELECT_SAVED_SETTINGS_MAIN_LABEL = 'SELECT SAVED SETTINGS'
1007
+ CHOOSE_MDX_MODEL_MAIN_LABEL = 'CHOOSE MDX-NET MODEL'
1008
+ BATCHES_MDX_MAIN_LABEL = 'BATCH SIZE'
1009
+ VOL_COMP_MDX_MAIN_LABEL = 'VOLUME COMPENSATION'
1010
+ SELECT_VR_MODEL_MAIN_LABEL = 'CHOOSE VR MODEL'
1011
+ AGGRESSION_SETTING_MAIN_LABEL = 'AGGRESSION SETTING'
1012
+ WINDOW_SIZE_MAIN_LABEL = 'WINDOW SIZE'
1013
+ CHOOSE_DEMUCS_MODEL_MAIN_LABEL = 'CHOOSE DEMUCS MODEL'
1014
+ CHOOSE_DEMUCS_STEMS_MAIN_LABEL = 'CHOOSE STEM(S)'
1015
+ CHOOSE_SEGMENT_MAIN_LABEL = 'SEGMENT'
1016
+ ENSEMBLE_OPTIONS_MAIN_LABEL = 'ENSEMBLE OPTIONS'
1017
+ CHOOSE_MAIN_PAIR_MAIN_LABEL = 'MAIN STEM PAIR'
1018
+ CHOOSE_ENSEMBLE_ALGORITHM_MAIN_LABEL = 'ENSEMBLE ALGORITHM'
1019
+ AVAILABLE_MODELS_MAIN_LABEL = 'AVAILABLE MODELS'
1020
+ CHOOSE_AUDIO_TOOLS_MAIN_LABEL = 'CHOOSE AUDIO TOOL'
1021
+ CHOOSE_MANUAL_ALGORITHM_MAIN_LABEL = 'CHOOSE ALGORITHM'
1022
+ CHOOSE_RATE_MAIN_LABEL = 'RATE'
1023
+ CHOOSE_SEMITONES_MAIN_LABEL = 'SEMITONES'
1024
+ GPU_CONVERSION_MAIN_LABEL = 'GPU Conversion'
1025
+
1026
+ if OPERATING_SYSTEM=="Darwin":
1027
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running macOS Catalina and above.\n' +\
1028
+ '• Application functionality for systems running macOS Mojave or lower is not guaranteed.\n' +\
1029
+ '• Application functionality for older or budget Mac systems is not guaranteed.\n\n'
1030
+ FONT_SIZE_F1 = 13
1031
+ FONT_SIZE_F2 = 11
1032
+ FONT_SIZE_F3 = 12
1033
+ FONT_SIZE_0 = 9
1034
+ FONT_SIZE_1 = 11
1035
+ FONT_SIZE_2 = 12
1036
+ FONT_SIZE_3 = 13
1037
+ FONT_SIZE_4 = 14
1038
+ FONT_SIZE_5 = 15
1039
+ FONT_SIZE_6 = 17
1040
+ HELP_HINT_CHECKBOX_WIDTH = 13
1041
+ MDX_CHECKBOXS_WIDTH = 14
1042
+ VR_CHECKBOXS_WIDTH = 14
1043
+ ENSEMBLE_CHECKBOXS_WIDTH = 18
1044
+ DEMUCS_CHECKBOXS_WIDTH = 14
1045
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 20
1046
+ GEN_SETTINGS_WIDTH = 17
1047
+ MENU_COMBOBOX_WIDTH = 16
1048
+
1049
+ elif OPERATING_SYSTEM=="Linux":
1050
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running Linux Ubuntu 18.04+.\n' +\
1051
+ '• Application functionality for systems running other Linux platforms is not guaranteed.\n' +\
1052
+ '• Application functionality for older or budget systems is not guaranteed.\n\n'
1053
+ FONT_SIZE_F1 = 10
1054
+ FONT_SIZE_F2 = 8
1055
+ FONT_SIZE_F3 = 9
1056
+ FONT_SIZE_0 = 7
1057
+ FONT_SIZE_1 = 8
1058
+ FONT_SIZE_2 = 9
1059
+ FONT_SIZE_3 = 10
1060
+ FONT_SIZE_4 = 11
1061
+ FONT_SIZE_5 = 12
1062
+ FONT_SIZE_6 = 15
1063
+ HELP_HINT_CHECKBOX_WIDTH = 13
1064
+ MDX_CHECKBOXS_WIDTH = 14
1065
+ VR_CHECKBOXS_WIDTH = 16
1066
+ ENSEMBLE_CHECKBOXS_WIDTH = 25
1067
+ DEMUCS_CHECKBOXS_WIDTH = 18
1068
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 27
1069
+ GEN_SETTINGS_WIDTH = 17
1070
+ MENU_COMBOBOX_WIDTH = 19
1071
+
1072
+ elif OPERATING_SYSTEM=="Windows":
1073
+ LICENSE_OS_SPECIFIC_TEXT = '• This application is intended for those running Windows 10 or higher.\n' +\
1074
+ '• Application functionality for systems running Windows 7 or lower is not guaranteed.\n' +\
1075
+ '• Application functionality for Intel Pentium & Celeron CPUs systems is not guaranteed.\n\n'
1076
+ FONT_SIZE_F1 = 10
1077
+ FONT_SIZE_F2 = 8
1078
+ FONT_SIZE_F3 = 9
1079
+ FONT_SIZE_0 = 7
1080
+ FONT_SIZE_1 = 8
1081
+ FONT_SIZE_2 = 9
1082
+ FONT_SIZE_3 = 10
1083
+ FONT_SIZE_4 = 11
1084
+ FONT_SIZE_5 = 12
1085
+ FONT_SIZE_6 = 15
1086
+ HELP_HINT_CHECKBOX_WIDTH = 16
1087
+ MDX_CHECKBOXS_WIDTH = 16
1088
+ VR_CHECKBOXS_WIDTH = 16
1089
+ ENSEMBLE_CHECKBOXS_WIDTH = 25
1090
+ DEMUCS_CHECKBOXS_WIDTH = 18
1091
+ DEMUCS_PRE_CHECKBOXS_WIDTH = 27
1092
+ GEN_SETTINGS_WIDTH = 23
1093
+ MENU_COMBOBOX_WIDTH = 19
1094
+
1095
+
1096
+ LICENSE_TEXT = lambda a, p:f'Current Application Version: Ultimate Vocal Remover {a}\n' +\
1097
+ f'Current Patch Version: {p}\n\n' +\
1098
+ 'Copyright (c) 2022 Ultimate Vocal Remover\n\n' +\
1099
+ 'UVR is free and open-source, but MIT licensed. Please credit us if you use our\n' +\
1100
+ f'models or code for projects unrelated to UVR.\n\n{LICENSE_OS_SPECIFIC_TEXT}' +\
1101
+ 'This bundle contains the UVR interface, Python, PyTorch, and other\n' +\
1102
+ 'dependencies needed to run the application effectively.\n\n' +\
1103
+ 'Website Links: This application, System or Service(s) may contain links to\n' +\
1104
+ 'other websites and downloads, and they are solely provided to you as an\n' +\
1105
+ 'additional convenience. You understand and acknowledge that by clicking\n' +\
1106
+ 'or activating such links you are accessing a site or service outside of\n' +\
1107
+ 'this application, and that we do not screen, review, approve, or otherwise\n' +\
1108
+ 'endorse any content or information contained in these linked websites.\n' +\
1109
+ 'You acknowledge and agree that we, our affiliates and partners are not\n' +\
1110
+ 'responsible for the contents of any of these linked websites, including\n' +\
1111
+ 'the accuracy or availability of information provided by the linked websites,\n' +\
1112
+ 'and we make no representations or warranties regarding your use of\n' +\
1113
+ 'the linked websites.\n\n' +\
1114
+ 'This application is MIT Licensed\n\n' +\
1115
+ 'Permission is hereby granted, free of charge, to any person obtaining a copy\n' +\
1116
+ 'of this software and associated documentation files (the "Software"), to deal\n' +\
1117
+ 'in the Software without restriction, including without limitation the rights\n' +\
1118
+ 'to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n' +\
1119
+ 'copies of the Software, and to permit persons to whom the Software is\n' +\
1120
+ 'furnished to do so, subject to the following conditions:\n\n' +\
1121
+ 'The above copyright notice and this permission notice shall be included in all\n' +\
1122
+ 'copies or substantial portions of the Software.\n\n' +\
1123
+ 'THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n' +\
1124
+ 'IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n' +\
1125
+ 'FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n' +\
1126
+ 'AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n' +\
1127
+ 'LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n' +\
1128
+ 'OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n' +\
1129
+ 'SOFTWARE.'
1130
+
1131
+ CHANGE_LOG_HEADER = lambda patch:f"Patch Version:\n\n{patch}"
1132
+
1133
+ #DND CONSTS
1134
+
1135
+ MAC_DND_CHECK = ('/Users/',
1136
+ '/Applications/',
1137
+ '/Library/',
1138
+ '/System/')
1139
+ LINUX_DND_CHECK = ('/home/',
1140
+ '/usr/')
1141
+ WINDOWS_DND_CHECK = ('A:', 'B:', 'C:', 'D:', 'E:', 'F:', 'G:', 'H:', 'I:', 'J:', 'K:', 'L:', 'M:', 'N:', 'O:', 'P:', 'Q:', 'R:', 'S:', 'T:', 'U:', 'V:', 'W:', 'X:', 'Y:', 'Z:')
1142
+
1143
+ WOOD_INST_MODEL_HASH = '0ec76fd9e65f81d8b4fbd13af4826ed8'
1144
+ WOOD_INST_PARAMS = {
1145
+ "vr_model_param": "4band_v3",
1146
+ "primary_stem": NO_WIND_INST_STEM
1147
+ }
uvr5/lib_v5/mdxnet.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABCMeta
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from pytorch_lightning import LightningModule
6
+ from .modules import TFC_TDF
7
+
8
+ dim_s = 4
9
+
10
+ class AbstractMDXNet(LightningModule):
11
+ __metaclass__ = ABCMeta
12
+
13
+ def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap):
14
+ super().__init__()
15
+ self.target_name = target_name
16
+ self.lr = lr
17
+ self.optimizer = optimizer
18
+ self.dim_c = dim_c
19
+ self.dim_f = dim_f
20
+ self.dim_t = dim_t
21
+ self.n_fft = n_fft
22
+ self.n_bins = n_fft // 2 + 1
23
+ self.hop_length = hop_length
24
+ self.window = nn.Parameter(torch.hann_window(window_length=self.n_fft, periodic=True), requires_grad=False)
25
+ self.freq_pad = nn.Parameter(torch.zeros([1, dim_c, self.n_bins - self.dim_f, self.dim_t]), requires_grad=False)
26
+
27
+ def configure_optimizers(self):
28
+ if self.optimizer == 'rmsprop':
29
+ return torch.optim.RMSprop(self.parameters(), self.lr)
30
+
31
+ if self.optimizer == 'adamw':
32
+ return torch.optim.AdamW(self.parameters(), self.lr)
33
+
34
+ class ConvTDFNet(AbstractMDXNet):
35
+ def __init__(self, target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length,
36
+ num_blocks, l, g, k, bn, bias, overlap):
37
+
38
+ super(ConvTDFNet, self).__init__(
39
+ target_name, lr, optimizer, dim_c, dim_f, dim_t, n_fft, hop_length, overlap)
40
+ self.save_hyperparameters()
41
+
42
+ self.num_blocks = num_blocks
43
+ self.l = l
44
+ self.g = g
45
+ self.k = k
46
+ self.bn = bn
47
+ self.bias = bias
48
+
49
+ if optimizer == 'rmsprop':
50
+ norm = nn.BatchNorm2d
51
+
52
+ if optimizer == 'adamw':
53
+ norm = lambda input:nn.GroupNorm(2, input)
54
+
55
+ self.n = num_blocks // 2
56
+ scale = (2, 2)
57
+
58
+ self.first_conv = nn.Sequential(
59
+ nn.Conv2d(in_channels=self.dim_c, out_channels=g, kernel_size=(1, 1)),
60
+ norm(g),
61
+ nn.ReLU(),
62
+ )
63
+
64
+ f = self.dim_f
65
+ c = g
66
+ self.encoding_blocks = nn.ModuleList()
67
+ self.ds = nn.ModuleList()
68
+ for i in range(self.n):
69
+ self.encoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))
70
+ self.ds.append(
71
+ nn.Sequential(
72
+ nn.Conv2d(in_channels=c, out_channels=c + g, kernel_size=scale, stride=scale),
73
+ norm(c + g),
74
+ nn.ReLU()
75
+ )
76
+ )
77
+ f = f // 2
78
+ c += g
79
+
80
+ self.bottleneck_block = TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm)
81
+
82
+ self.decoding_blocks = nn.ModuleList()
83
+ self.us = nn.ModuleList()
84
+ for i in range(self.n):
85
+ self.us.append(
86
+ nn.Sequential(
87
+ nn.ConvTranspose2d(in_channels=c, out_channels=c - g, kernel_size=scale, stride=scale),
88
+ norm(c - g),
89
+ nn.ReLU()
90
+ )
91
+ )
92
+ f = f * 2
93
+ c -= g
94
+
95
+ self.decoding_blocks.append(TFC_TDF(c, l, f, k, bn, bias=bias, norm=norm))
96
+
97
+ self.final_conv = nn.Sequential(
98
+ nn.Conv2d(in_channels=c, out_channels=self.dim_c, kernel_size=(1, 1)),
99
+ )
100
+
101
+ def forward(self, x):
102
+
103
+ x = self.first_conv(x)
104
+
105
+ x = x.transpose(-1, -2)
106
+
107
+ ds_outputs = []
108
+ for i in range(self.n):
109
+ x = self.encoding_blocks[i](x)
110
+ ds_outputs.append(x)
111
+ x = self.ds[i](x)
112
+
113
+ x = self.bottleneck_block(x)
114
+
115
+ for i in range(self.n):
116
+ x = self.us[i](x)
117
+ x *= ds_outputs[-i - 1]
118
+ x = self.decoding_blocks[i](x)
119
+
120
+ x = x.transpose(-1, -2)
121
+
122
+ x = self.final_conv(x)
123
+
124
+ return x
125
+
126
+ class Mixer(nn.Module):
127
+ def __init__(self, device, mixer_path):
128
+
129
+ super(Mixer, self).__init__()
130
+
131
+ self.linear = nn.Linear((dim_s+1)*2, dim_s*2, bias=False)
132
+
133
+ self.load_state_dict(
134
+ torch.load(mixer_path, map_location=device)
135
+ )
136
+
137
+ def forward(self, x):
138
+ x = x.reshape(1,(dim_s+1)*2,-1).transpose(-1,-2)
139
+ x = self.linear(x)
140
+ return x.transpose(-1,-2).reshape(dim_s,2,-1)
uvr5/lib_v5/mixer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea781bd52c6a523b825fa6cdbb6189f52e318edd8b17e6fe404f76f7af8caa9c
3
+ size 1208
uvr5/lib_v5/modules.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class TFC(nn.Module):
    """Time-Frequency Convolution block: ``l`` identical conv->norm->ReLU
    layers applied sequentially, preserving the channel count ``c``."""

    def __init__(self, c, l, k, norm):
        super(TFC, self).__init__()

        layers = [
            nn.Sequential(
                nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
                norm(c),
                nn.ReLU(),
            )
            for _ in range(l)
        ]
        # Keep the attribute name `H` so existing checkpoints load unchanged.
        self.H = nn.ModuleList(layers)

    def forward(self, x):
        for block in self.H:
            x = block(x)
        return x
23
+
24
+
25
class DenseTFC(nn.Module):
    """Densely connected variant of TFC: each intermediate layer's output is
    concatenated with its input along the channel axis before the next layer.

    NOTE(review): every Conv2d here is built with ``in_channels=c``, but after
    the first concatenation the running tensor has ``2c`` channels, so for
    ``l > 1`` the later layers would receive more channels than they expect.
    This path is only reached when TFC_TDF is built with ``dense=True`` (not
    the default) -- confirm against the upstream implementation before
    enabling it.
    """

    def __init__(self, c, l, k, norm):
        super(DenseTFC, self).__init__()

        self.conv = nn.ModuleList()
        for i in range(l):
            self.conv.append(
                nn.Sequential(
                    nn.Conv2d(in_channels=c, out_channels=c, kernel_size=k, stride=1, padding=k // 2),
                    norm(c),
                    nn.ReLU(),
                )
            )

    def forward(self, x):
        # Dense connectivity: concatenate each layer's output with its input;
        # the final layer consumes the accumulated stack.
        for layer in self.conv[:-1]:
            x = torch.cat([layer(x), x], 1)
        return self.conv[-1](x)
43
+
44
+
45
class TFC_TDF(nn.Module):
    """TFC block followed by an optional TDF (time-distributed
    fully-connected) residual branch acting on the frequency axis ``f``.

    ``bn`` controls the TDF bottleneck: None disables the branch, 0 uses a
    single full-width Linear, any other value uses a two-layer bottleneck
    with hidden width ``f // bn``.
    """

    def __init__(self, c, l, f, k, bn, dense=False, bias=True, norm=nn.BatchNorm2d):
        super(TFC_TDF, self).__init__()

        self.use_tdf = bn is not None

        # Keep attribute names `tfc` / `tdf` for checkpoint compatibility.
        self.tfc = DenseTFC(c, l, k, norm) if dense else TFC(c, l, k, norm)

        if not self.use_tdf:
            return

        if bn == 0:
            tdf_layers = [
                nn.Linear(f, f, bias=bias),
                norm(c),
                nn.ReLU(),
            ]
        else:
            tdf_layers = [
                nn.Linear(f, f // bn, bias=bias),
                norm(c),
                nn.ReLU(),
                nn.Linear(f // bn, f, bias=bias),
                norm(c),
                nn.ReLU(),
            ]
        self.tdf = nn.Sequential(*tdf_layers)

    def forward(self, x):
        out = self.tfc(x)
        if self.use_tdf:
            # Residual connection around the frequency-wise Linear branch.
            out = out + self.tdf(out)
        return out
74
+
uvr5/lib_v5/pyrb.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import six
5
+ import numpy as np
6
+ import soundfile as sf
7
+ import sys
8
+
9
# Resolve the directory holding the bundled `rubberband` CLI binary.  When
# running from a PyInstaller bundle ("frozen"), resources are unpacked under
# sys._MEIPASS; otherwise use this module's directory.
if getattr(sys, 'frozen', False):
    BASE_PATH_RUB = sys._MEIPASS
else:
    BASE_PATH_RUB = os.path.dirname(os.path.abspath(__file__))

__all__ = ['time_stretch', 'pitch_shift']

# Full path to the rubberband command-line executable shipped next to this file.
__RUBBERBAND_UTIL = os.path.join(BASE_PATH_RUB, 'rubberband')

# Python 2 has no subprocess.DEVNULL; fall back to an os.devnull handle.
if six.PY2:
    DEVNULL = open(os.devnull, 'w')
else:
    DEVNULL = subprocess.DEVNULL
22
+
23
def __rubberband(y, sr, **kwargs):
    """Run the rubberband CLI on audio ``y`` and return the processed audio.

    The audio is round-tripped through temporary WAV files; each kwarg
    becomes a CLI flag/value pair (e.g. ``{'--tempo': 1.5}``).  Raises
    RuntimeError (chained from OSError) if the binary cannot be executed.
    Temporary files are always removed.
    """

    assert sr > 0

    # Get the input and output tempfile; close the fds immediately since
    # only the paths are needed.
    fd, infile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    fd, outfile = tempfile.mkstemp(suffix='.wav')
    os.close(fd)

    # dump the audio
    sf.write(infile, y, sr)

    try:
        # Execute rubberband ('-q' silences its console output).
        arguments = [__RUBBERBAND_UTIL, '-q']

        for key, value in six.iteritems(kwargs):
            arguments.append(str(key))
            arguments.append(str(value))

        arguments.extend([infile, outfile])

        subprocess.check_call(arguments, stdout=DEVNULL, stderr=DEVNULL)

        # Load the processed audio.
        y_out, _ = sf.read(outfile, always_2d=True)

        # make sure that output dimensions matches input
        if y.ndim == 1:
            y_out = np.squeeze(y_out)

    except OSError as exc:
        # Typically the binary is missing or not executable.
        six.raise_from(RuntimeError('Failed to execute rubberband. '
                                    'Please verify that rubberband-cli '
                                    'is installed.'),
                       exc)

    finally:
        # Remove temp files
        os.unlink(infile)
        os.unlink(outfile)

    return y_out
67
+
68
def time_stretch(y, sr, rate, rbargs=None):
    """Time-stretch audio by ``rate`` using the rubberband CLI.

    Parameters
    ----------
    y : audio samples (mono or multi-channel).
    sr : int
        Sample rate of ``y``; must be positive.
    rate : float
        Stretch factor (> 0); 1.0 returns ``y`` unchanged without invoking
        rubberband.
    rbargs : dict, optional
        Extra rubberband CLI flags.  Not mutated (the original setdefault
        mutated the caller's dict).

    Raises
    ------
    ValueError
        If ``rate`` is not strictly positive.
    """
    if rate <= 0:
        raise ValueError('rate must be strictly positive')

    if rate == 1.0:
        return y

    # Copy so the caller's dict is not mutated by setdefault below.
    rbargs = dict(rbargs) if rbargs else {}
    rbargs.setdefault('--tempo', rate)

    return __rubberband(y, sr, **rbargs)
81
+
82
def pitch_shift(y, sr, n_steps, rbargs=None):
    """Pitch-shift audio by ``n_steps`` semitones using the rubberband CLI.

    Parameters
    ----------
    y : audio samples (mono or multi-channel).
    sr : int
        Sample rate of ``y``.
    n_steps : float
        Semitones to shift; 0 returns ``y`` unchanged without invoking
        rubberband.
    rbargs : dict, optional
        Extra rubberband CLI flags.  Not mutated (the original setdefault
        mutated the caller's dict).
    """
    if n_steps == 0:
        return y

    # Copy so the caller's dict is not mutated by setdefault below.
    rbargs = dict(rbargs) if rbargs else {}
    rbargs.setdefault('--pitch', n_steps)

    return __rubberband(y, sr, **rbargs)
uvr5/lib_v5/spec_utils.py ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import librosa
import numpy as np
import soundfile as sf
import math
import random
import platform
import traceback

# Platform detection used to pick the rubberband wrapper and resampler.
OPERATING_SYSTEM = platform.system()
SYSTEM_ARCH = platform.platform()
SYSTEM_PROC = platform.processor()
ARM = 'arm'

# Select the rubberband wrapper: the third-party pyrubberband package on
# Windows, the bundled CLI wrapper everywhere else.  (The original file also
# did an unconditional `from . import pyrb` before this conditional and
# imported `math` twice; both redundancies are removed here.)
if OPERATING_SYSTEM == 'Windows':
    from pyrubberband import pyrb
else:
    from . import pyrb

# Resampler choice: polyphase on ARM Macs, sinc_fastest elsewhere.
if OPERATING_SYSTEM == 'Darwin':
    wav_resolution = "polyphase" if SYSTEM_PROC == ARM or ARM in SYSTEM_ARCH else "sinc_fastest"
else:
    wav_resolution = "sinc_fastest"

# Ensembling algorithm identifiers (see ensembling / ensemble_inputs).
MAX_SPEC = 'Max Spec'
MIN_SPEC = 'Min Spec'
AVERAGE = 'Average'
29
+
30
def crop_center(h1, h2):
    """Center-crop ``h1`` along the last (time) axis to ``h2``'s length.

    Both tensors are 4-D; only axis 3 is cropped.  Returns ``h1`` unchanged
    when the lengths already match, and raises ValueError when ``h1`` is
    shorter than ``h2``.
    """
    t1 = h1.size()[3]
    t2 = h2.size()[3]

    if t1 == t2:
        return h1
    if t1 < t2:
        raise ValueError('h1_shape[3] must be greater than h2_shape[3]')

    start = (t1 - t2) // 2
    return h1[:, :, :, start:start + t2]
44
+
45
def preprocess(X_spec):
    """Split a complex spectrogram into magnitude and phase components."""
    return np.abs(X_spec), np.angle(X_spec)
50
+
51
def make_padding(width, cropsize, offset):
    """Compute (left, right, roi_size) padding so ``width`` splits into
    whole ROI windows of ``cropsize - 2 * offset`` frames each.

    When the offset consumes the whole crop (roi would be 0), the full
    cropsize is used as the ROI instead.
    """
    roi_size = cropsize - 2 * offset or cropsize
    right = roi_size - (width % roi_size) + offset
    return offset, right, roi_size
59
+
60
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Convert a stereo waveform to a 2-channel complex STFT spectrogram.

    The two channels may first be remapped:
      reverse     -- time-reverse both channels
      mid_side    -- encode as mid ((L+R)/2) and side (L-R)
      mid_side_b2 -- alternative mid/side weighting used by some models

    Returns a Fortran-ordered (2, bins, frames) complex array.
    """
    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # Pass n_fft by keyword: librosa >= 0.10 no longer accepts it
    # positionally, and this matches the call style already used in
    # wave_to_spectrogram_mt.
    spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length)
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)

    spec = np.asfortranarray([spec_left, spec_right])

    return spec
80
+
81
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
    """Multi-threaded variant of wave_to_spectrogram: the two channel STFTs
    are computed in parallel (one in a worker thread, one on the caller's
    thread).  Channel remapping flags behave as in wave_to_spectrogram.
    """
    import threading

    if reverse:
        wave_left = np.flip(np.asfortranarray(wave[0]))
        wave_right = np.flip(np.asfortranarray(wave[1]))
    elif mid_side:
        wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
    elif mid_side_b2:
        wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
        wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
    else:
        wave_left = np.asfortranarray(wave[0])
        wave_right = np.asfortranarray(wave[1])

    # Publish the worker's result through a local container instead of the
    # original module-level `global spec_left`, so concurrent calls cannot
    # clobber each other's result.
    results = {}

    def run_thread(**kwargs):
        results['spec_left'] = librosa.stft(**kwargs)

    thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
    thread.start()
    spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length)
    thread.join()

    spec = np.asfortranarray([results['spec_left'], spec_right])

    return spec
110
+
111
def normalize(wave, is_normalize=False):
    """Optionally peak-normalize a waveform and downmix it to mono.

    Parameters
    ----------
    wave : np.ndarray
        2-D audio array; orientation may be (channels, samples) or
        (samples, channels) -- the shorter axis is treated as channels.
    is_normalize : bool
        When True and the peak exceeds 1.0, rescale so the peak is 1.0.

    Returns
    -------
    np.ndarray
        1-D mono waveform.
    """
    maxv = np.max(np.abs(wave))
    if maxv > 1.0:
        print(f"\nNormalization Set {is_normalize}: Input above threshold for clipping. Max:{maxv}")
        if is_normalize:
            print(f"The result was normalized.")
            wave /= maxv
        else:
            print(f"The result was not normalized.")
    else:
        print(f"\nNormalization Set {is_normalize}: Input not above threshold for clipping. Max:{maxv}")
    # stereo to mono
    # NOTE(review): this always averages across the shorter axis, i.e. the
    # output is forced to mono regardless of is_normalize -- confirm callers
    # expect mono output here.
    if wave.shape[1] < wave.shape[0]:
        wave = np.mean(wave, axis=1)
    else:
        wave = np.mean(wave, axis=0)
    return wave
129
+
130
def normalize_two_stem(wave, mix, is_normalize=False):
    """Optionally peak-normalize a primary stem together with the mixture.

    Both arrays are scaled by the same factor so their relative level is
    preserved.

    Parameters
    ----------
    wave : np.ndarray
        Primary source audio.
    mix : np.ndarray
        Mixture audio, rescaled alongside ``wave``.
    is_normalize : bool
        When True and the stem's peak exceeds 1.0, rescale both arrays.

    Returns
    -------
    tuple
        (wave, mix), possibly rescaled in place.
    """

    maxv = np.abs(wave).max()
    max_mix = np.abs(mix).max()

    if maxv > 1.0:
        print(f"\nNormalization Set {is_normalize}: Primary source above threshold for clipping. Max:{maxv}")
        print(f"\nNormalization Set {is_normalize}: Mixture above threshold for clipping. Max:{max_mix}")
        if is_normalize:
            print(f"The result was normalized.")
            # NOTE(review): both arrays are divided by the stem's peak (maxv),
            # not by their own peaks -- this keeps the stem/mix balance but
            # can leave the mix clipped when max_mix > maxv. Confirm intended.
            wave /= maxv
            mix /= maxv
        else:
            print(f"The result was not normalized.")
    else:
        print(f"\nNormalization Set {is_normalize}: Input not above threshold for clipping. Max:{maxv}")


    print(f"\nNormalization Set {is_normalize}: Primary source - Max:{np.abs(wave).max()}")
    print(f"\nNormalization Set {is_normalize}: Mixture - Max:{np.abs(mix).max()}")

    return wave, mix
153
+
154
def combine_spectrograms(specs, mp):
    """Stack per-band spectrograms into a single multi-band spectrogram.

    Parameters
    ----------
    specs : dict
        Band index (1-based) -> (2, bins, frames) complex spectrogram.
    mp : model-parameter object
        Supplies mp.param['band'][d]['crop_start'/'crop_stop'], the total
        'bins' count and the pre-filter range.

    Returns
    -------
    np.ndarray
        Fortran-ordered (2, bins + 1, frames) complex64 array.

    Raises
    ------
    ValueError
        If the cropped bands overflow the configured bin count.
    """
    # Truncate every band to the shortest frame count.
    l = min([specs[i].shape[2] for i in specs])
    spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
    offset = 0
    bands_n = len(mp.param['band'])

    # Copy each band's cropped bin range into the stacked output.
    for d in range(1, bands_n + 1):
        h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
        spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
        offset += h

    if offset > mp.param['bins']:
        raise ValueError('Too much bins')

    # lowpass filter above the pre-filter start
    if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
        if bands_n == 1:
            spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
        else:
            # Multi-band: apply a gentle exponential roll-off instead of
            # the linear low-pass fade.
            gp = 1
            for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
                g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
                gp = g
                spec_c[:, b, :] *= g

    return np.asfortranarray(spec_c)
180
+
181
def spectrogram_to_image(spec, mode='magnitude'):
    """Render a spectrogram as an 8-bit image array.

    'magnitude' maps log-power to intensity; 'phase' maps the angle.
    Complex input is reduced with abs/angle first.  3-D input (two
    channels) becomes an HxWx3 image whose first plane is the per-pixel
    channel maximum.
    """
    if mode == 'magnitude':
        y = np.abs(spec) if np.iscomplexobj(spec) else spec
        y = np.log10(y ** 2 + 1e-8)
    elif mode == 'phase':
        y = np.angle(spec) if np.iscomplexobj(spec) else spec

    # Rescale to the full 0..255 range.
    y -= y.min()
    y *= 255 / y.max()
    img = np.uint8(y)

    if y.ndim == 3:
        img = img.transpose(1, 2, 0)
        img = np.concatenate(
            [np.max(img, axis=2, keepdims=True), img],
            axis=2,
        )

    return img
205
+
206
def reduce_vocal_aggressively(X, y, softmask):
    """Attenuate bins of ``y`` where the residual ``X - y`` dominates.

    Wherever the residual magnitude exceeds ``y``'s magnitude, ``y``'s
    magnitude is reduced by ``softmask`` times the residual magnitude
    (clipped at zero); ``y``'s phase is preserved.
    """
    residual = X - y
    y_mag = np.abs(y)
    res_mag = np.abs(residual)

    dominated = res_mag > y_mag
    new_mag = np.clip(y_mag - res_mag * dominated * softmask, 0, np.inf)

    return new_mag * np.exp(1.j * np.angle(y))
215
+
216
def merge_artifacts(y_mask, thres=0.01, min_range=64, fade_size=32):
    """Smooth a separation mask over sustained high-energy regions.

    Frame ranges whose mask minimum stays above ``thres`` for more than
    ``min_range`` frames are blended toward 1.0, with linear fades of
    ``fade_size`` frames at the edges, which suppresses artifact flutter.
    Any failure is logged and the mask is returned unmodified.
    """
    mask = y_mask

    try:
        if min_range < fade_size * 2:
            raise ValueError('min_range must be >= fade_size * 2')

        # Frames where every bin of the mask exceeds the threshold.
        idx = np.where(y_mask.min(axis=(0, 1)) > thres)[0]
        # Start/end frame of each contiguous run of such frames.
        start_idx = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
        end_idx = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
        # Keep only runs longer than min_range.
        artifact_idx = np.where(end_idx - start_idx > min_range)[0]
        weight = np.zeros_like(y_mask)
        if len(artifact_idx) > 0:
            start_idx = start_idx[artifact_idx]
            end_idx = end_idx[artifact_idx]
            old_e = None
            for s, e in zip(start_idx, end_idx):
                # Merge runs separated by less than one fade length.
                if old_e is not None and s - old_e < fade_size:
                    s = old_e - fade_size * 2

                if s != 0:
                    # Fade-in at the start of the run.
                    weight[:, :, s:s + fade_size] = np.linspace(0, 1, fade_size)
                else:
                    s -= fade_size

                if e != y_mask.shape[2]:
                    # Fade-out at the end of the run.
                    weight[:, :, e - fade_size:e] = np.linspace(1, 0, fade_size)
                else:
                    e += fade_size

                # Full weight between the fades.
                weight[:, :, s + fade_size:e - fade_size] = 1
                old_e = e

            # Blend the mask toward 1.0 where the weight is high.
            v_mask = 1 - y_mask
            y_mask += weight * v_mask

            mask = y_mask
    except Exception as e:
        # Best-effort post-processing: report and fall back to the input mask.
        error_name = f'{type(e).__name__}'
        traceback_text = ''.join(traceback.format_tb(e.__traceback__))
        message = f'{error_name}: "{e}"\n{traceback_text}"'
        print('Post Process Failed: ', message)


    return mask
261
+
262
def align_wave_head_and_tail(a, b):
    """Truncate two (channels, samples) arrays to their common sample length.

    Fixes the original slice ``a[:l, :l]``, which also clipped the channel
    axis (harmless in practice only because ``l`` is normally far larger
    than the channel count); only the sample axis should be cut.
    """
    l = min(a[0].size, b[0].size)

    return a[:, :l], b[:, :l]
266
+
267
def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse, clamp=False):
    """ISTFT a 2-channel spectrogram and undo any channel remapping.

    Inverse of wave_to_spectrogram: ``reverse`` flips time back,
    ``mid_side`` decodes mid/side back to L/R, ``mid_side_b2`` decodes the
    alternative weighting.  ``clamp`` is accepted but unused here.
    Returns a Fortran-ordered (2, samples) array.
    """
    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    wave_left = librosa.istft(spec_left, hop_length=hop_length)
    wave_right = librosa.istft(spec_right, hop_length=hop_length)

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        # L = mid + side/2, R = mid - side/2.
        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
    elif mid_side_b2:
        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
    else:
        return np.asfortranarray([wave_left, wave_right])
282
+
283
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
    """Multi-threaded variant of spectrogram_to_wave (one ISTFT per thread).

    NOTE(review): the worker publishes its result through a module-level
    global (``wave_left``), so concurrent calls would race -- confirm this
    is only ever invoked from a single thread at a time.
    """
    import threading

    spec_left = np.asfortranarray(spec[0])
    spec_right = np.asfortranarray(spec[1])

    def run_thread(**kwargs):
        global wave_left
        wave_left = librosa.istft(**kwargs)

    thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
    thread.start()
    wave_right = librosa.istft(spec_right, hop_length=hop_length)
    thread.join()

    if reverse:
        return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
    elif mid_side:
        # L = mid + side/2, R = mid - side/2.
        return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
    elif mid_side_b2:
        return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
    else:
        return np.asfortranarray([wave_left, wave_right])
306
+
307
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
    """Inverse of combine_spectrograms: reconstruct audio from a stacked
    multi-band spectrogram.

    Each band is re-inserted at its original FFT position, band-limited
    with low/high-pass fades, converted to a waveform at the band's sample
    rate and resampled upward, summing progressively into the final
    full-rate wave.

    Parameters
    ----------
    spec_m : np.ndarray
        (2, bins, frames) stacked spectrogram from combine_spectrograms.
    mp : model-parameter object with per-band STFT/crop/filter settings.
    extra_bins_h, extra_bins : optional high-end bins restored into the top
        band when high-end processing is bypassed.

    Returns
    -------
    np.ndarray
        Stereo waveform at the top band's sample rate.
    """
    bands_n = len(mp.param['band'])
    offset = 0

    for d in range(1, bands_n + 1):
        bp = mp.param['band'][d]
        spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
        h = bp['crop_stop'] - bp['crop_start']
        # Re-insert this band's bins at their original FFT position.
        spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]

        offset += h
        if d == bands_n: # highest band
            if extra_bins_h: # if --high_end_process bypass
                max_bin = bp['n_fft'] // 2
                spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
            if bp['hpf_start'] > 0:
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
            if bands_n == 1:
                wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
            else:
                wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
        else:
            # Resample this band up to the next band's sample rate.
            # NOTE(review): positional sr arguments to librosa.resample were
            # removed in librosa >= 0.10; use orig_sr=/target_sr= when
            # upgrading the pinned librosa version.
            sr = mp.param['band'][d+1]['sr']
            if d == 1: # lowest band
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'] - 1) # test
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type=wav_resolution)
            else: # mid bands: band-pass, add to the running sum, resample
                spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
                spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
                wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
                wave = librosa.resample(wave2, bp['sr'], sr, res_type=wav_resolution)

    return wave
346
+
347
def fft_lp_filter(spec, bin_start, bin_stop):
    """Low-pass a spectrogram in place: fade bins linearly from full gain at
    ``bin_start`` down to zero at ``bin_stop``, then zero everything above.
    Returns the same (mutated) array.
    """
    step = 1 / (bin_stop - bin_start)
    g = 1.0
    for b in range(bin_start, bin_stop):
        g -= step
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, bin_stop:, :] *= 0

    return spec
356
+
357
def fft_hp_filter(spec, bin_start, bin_stop):
    """High-pass a spectrogram in place: fade bins linearly from full gain
    at ``bin_start`` down toward zero at ``bin_stop`` (iterating downward),
    then zero everything at or below ``bin_stop``.  Returns the same
    (mutated) array.
    """
    step = 1 / (bin_start - bin_stop)
    g = 1.0
    for b in range(bin_start, bin_stop, -1):
        g -= step
        spec[:, b, :] = g * spec[:, b, :]

    spec[:, 0:bin_stop + 1, :] *= 0

    return spec
366
+
367
def mirroring(a, spec_m, input_high_end, mp):
    """Synthesize the missing high-frequency band by mirroring lower bins.

    a == 'mirroring'  : reflect the band just below the pre-filter cutoff,
                        restore the input high end's phase, and keep the
                        quieter of (input, mirror) per bin.
    a == 'mirroring2' : scale the mirrored magnitudes by the input high end
                        (x1.7) and keep the quieter per bin.
    Any other mode falls through and returns None.
    """
    if 'mirroring' == a:
        # Reflect the bins just below the pre-filter start (10-bin margin).
        mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
        # Re-apply the original high-end phase to the mirrored magnitudes.
        mirror = mirror * np.exp(1.j * np.angle(input_high_end))

        # Keep whichever is quieter to avoid boosting noise.
        return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)

    if 'mirroring2' == a:
        mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
        mi = np.multiply(mirror, input_high_end * 1.7)

        return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
379
+
380
def adjust_aggr(mask, is_non_accom_stem, aggressiveness):
    """Sharpen a separation mask according to the aggressiveness setting.

    The mask is raised to a power greater than 1 (with a softened exponent
    below ``split_bin``), pushing intermediate values toward 0 and making
    the separation more aggressive.  Per-channel corrections may be applied
    via ``aggressiveness['aggr_correction']``.  Mutates and returns ``mask``.
    """
    aggr = aggressiveness['value']

    if aggr != 0:
        # Non-accompaniment stems use the complementary setting.
        if is_non_accom_stem:
            aggr = 1 - aggr

        aggr = [aggr, aggr]

        if aggressiveness['aggr_correction'] is not None:
            aggr[0] += aggressiveness['aggr_correction']['left']
            aggr[1] += aggressiveness['aggr_correction']['right']

        for ch in range(2):
            # Lower bins get a softened exponent (1 + aggr/3), higher bins
            # the full exponent (1 + aggr).
            mask[ch, :aggressiveness['split_bin']] = np.power(mask[ch, :aggressiveness['split_bin']], 1 + aggr[ch] / 3)
            mask[ch, aggressiveness['split_bin']:] = np.power(mask[ch, aggressiveness['split_bin']:], 1 + aggr[ch])

    return mask
401
+
402
def stft(wave, nfft, hl):
    """Stereo STFT helper: returns a Fortran-ordered (2, bins, frames)
    complex array for a (2, samples) waveform.
    """
    wave_left = np.asfortranarray(wave[0])
    wave_right = np.asfortranarray(wave[1])
    # Pass n_fft by keyword: librosa >= 0.10 no longer accepts it
    # positionally (matches the call style in wave_to_spectrogram_mt).
    spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl)
    spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl)
    spec = np.asfortranarray([spec_left, spec_right])

    return spec
410
+
411
def istft(spec, hl):
    """Inverse of stft(): reconstruct a Fortran-ordered (2, samples)
    waveform from a (2, bins, frames) complex spectrogram.
    """
    channels = [
        librosa.istft(np.asfortranarray(channel), hop_length=hl)
        for channel in (spec[0], spec[1])
    ]
    return np.asfortranarray(channels)
419
+
420
def spec_effects(wave, algorithm='Default', value=None):
    """Combine two stereo waveforms using a spectral or linear strategy.

    Parameters
    ----------
    wave : sequence of two (2, samples) arrays.
    algorithm : str
        'Min_Mag' / 'Max_Mag' keep the per-bin smaller/larger magnitude;
        'Default' is a linear crossfade weighted by ``value``;
        'Invert_p' performs phase-aware subtraction of wave[0] from wave[1].
    value : float
        Crossfade weight, used only by 'Default'.
    """
    spec = [stft(wave[0],2048,1024), stft(wave[1],2048,1024)]
    if algorithm == 'Min_Mag':
        v_spec_m = np.where(np.abs(spec[1]) <= np.abs(spec[0]), spec[1], spec[0])
        wave = istft(v_spec_m,1024)
    elif algorithm == 'Max_Mag':
        v_spec_m = np.where(np.abs(spec[1]) >= np.abs(spec[0]), spec[1], spec[0])
        wave = istft(v_spec_m,1024)
    elif algorithm == 'Default':
        wave = (wave[1] * value) + (wave[0] * (1-value))
    elif algorithm == 'Invert_p':
        # Subtract the per-bin max magnitude (with wave[0]'s phase) from
        # wave[1]'s spectrum.
        X_mag = np.abs(spec[0])
        y_mag = np.abs(spec[1])
        max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
        v_spec = spec[1] - max_mag * np.exp(1.j * np.angle(spec[0]))
        wave = istft(v_spec,1024)

    return wave
438
+
439
def spectrogram_to_wave_no_mp(spec, n_fft=2048, hop_length=1024):
    """Single-threaded ISTFT; mono output is duplicated to two channels.

    NOTE(review): librosa.istft only accepts an ``n_fft`` keyword in newer
    releases -- verify against the pinned librosa version.
    """
    wave = librosa.istft(spec, n_fft=n_fft, hop_length=hop_length)

    if wave.ndim == 1:
        wave = np.asfortranarray([wave,wave])

    return wave
446
+
447
def wave_to_spectrogram_no_mp(wave):
    """STFT with fixed 2048/1024 parameters; a mono spectrogram is
    duplicated so the result always has two channels.
    """
    spec = librosa.stft(wave, n_fft=2048, hop_length=1024)

    if spec.ndim == 1:
        spec = np.asfortranarray([spec, spec])

    return spec
455
+
456
def invert_audio(specs, invert_p=True):
    """Subtract a stem spectrogram from a mixture spectrogram.

    Parameters
    ----------
    specs : list
        [mixture_spec, stem_spec]; both are truncated (in place) to the
        common frame count.
    invert_p : bool
        True uses phase-aware subtraction (per-bin max magnitude with the
        mixture's phase); False first attenuates the stem with
        reduce_vocal_aggressively and subtracts it directly.

    Returns
    -------
    np.ndarray
        The residual spectrogram.
    """

    ln = min([specs[0].shape[2], specs[1].shape[2]])
    specs[0] = specs[0][:,:,:ln]
    specs[1] = specs[1][:,:,:ln]

    if invert_p:
        X_mag = np.abs(specs[0])
        y_mag = np.abs(specs[1])
        max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
        v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
    else:
        specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
        v_spec = specs[0] - specs[1]

    return v_spec
472
+
473
def invert_stem(mixture, stem):
    """Phase-aware subtraction of a stem waveform from the mixture waveform.

    Both inputs are converted to spectrograms, subtracted via invert_audio,
    and the result is returned as a negated, transposed waveform.
    """
    mix_spec = wave_to_spectrogram_no_mp(mixture)
    stem_spec = wave_to_spectrogram_no_mp(stem)
    inverted = spectrogram_to_wave_no_mp(invert_audio([mix_spec, stem_spec]))
    return -inverted.T
480
+
481
def ensembling(a, specs):
    """Fold a list of spectrograms into one according to strategy ``a``.

    MIN_SPEC / MAX_SPEC keep the per-bin smaller / larger magnitude across
    all inputs, trimming each pair to the common frame count.

    NOTE(review): the AVERAGE branch only replaces bins whose magnitudes
    are exactly equal, so it is effectively a no-op rather than a mean;
    ensemble_inputs routes AVERAGE to average_audio() beforehand, so this
    branch appears to be dead code -- confirm before relying on it.
    """
    for i in range(1, len(specs)):
        if i == 1:
            spec = specs[0]

        # Trim both operands to the common frame count.
        ln = min([spec.shape[2], specs[i].shape[2]])
        spec = spec[:,:,:ln]
        specs[i] = specs[i][:,:,:ln]

        if MIN_SPEC == a:
            spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
        if MAX_SPEC == a:
            spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
        if AVERAGE == a:
            spec = np.where(np.abs(specs[i]) == np.abs(spec), specs[i], spec)

    return spec
498
+
499
def ensemble_inputs(audio_input, algorithm, is_normalization, wav_type_set, save_path):
    """Ensemble several audio files into one and write the result.

    AVERAGE takes the sample-wise mean of the waveforms; other algorithms
    combine spectrograms via ensembling().  The result is zero-padded to
    the longest input before being written to ``save_path``.
    """

    wavs_ = []

    if algorithm == AVERAGE:
        output = average_audio(audio_input)
        samplerate = 44100
    else:
        specs = []

        for i in range(len(audio_input)):
            # All inputs are resampled to 44.1 kHz; `samplerate` keeps the
            # value returned for the last file loaded.
            wave, samplerate = librosa.load(audio_input[i], mono=False, sr=44100)
            wavs_.append(wave)
            spec = wave_to_spectrogram_no_mp(wave)
            specs.append(spec)

        # Pad the ensembled result to the longest input waveform.
        wave_shapes = [w.shape[1] for w in wavs_]
        target_shape = wavs_[wave_shapes.index(max(wave_shapes))]

        output = spectrogram_to_wave_no_mp(ensembling(algorithm, specs))
        output = to_shape(output, target_shape.shape)

    sf.write(save_path, normalize(output.T, is_normalization), samplerate, subtype=wav_type_set)
522
+
523
def to_shape(x, target_shape):
    """Zero-pad ``x`` at the trailing end of each axis so its shape equals
    ``target_shape`` (each target dimension must be >= the source one).
    """
    pads = tuple((0, target_dim - src_dim)
                 for src_dim, target_dim in zip(x.shape, target_shape))
    return np.pad(x, pads, mode='constant')
531
+
532
def to_shape_minimize(x: np.ndarray, target_shape):
    """Zero-pad ``x`` at the trailing end of each axis to ``target_shape``.

    Identical in behavior to to_shape; kept as a separate name for API
    compatibility with existing callers.
    """
    pads = tuple((0, target_dim - src_dim)
                 for src_dim, target_dim in zip(x.shape, target_shape))
    return np.pad(x, pads, mode='constant')
541
+
542
def augment_audio(export_path, audio_file, rate, is_normalization, wav_type_set, save_format=None, is_pitch=False):
    """Time-stretch or pitch-shift an audio file and save the result.

    Parameters
    ----------
    export_path : output file path.
    audio_file : input file path (loaded at 44.1 kHz).
    rate : float
        Stretch factor when ``is_pitch`` is False; semitone shift otherwise.
    save_format : callable
        Applied to the exported path for format conversion.
        NOTE(review): called unconditionally below, so passing the default
        None would raise -- confirm all callers supply it.
    is_pitch : bool
        Selects pitch_shift vs time_stretch.
    """

    wav, sr = librosa.load(audio_file, sr=44100, mono=False)

    if wav.ndim == 1:
        wav = np.asfortranarray([wav,wav])

    # Process each channel independently through rubberband.
    if is_pitch:
        wav_1 = pyrb.pitch_shift(wav[0], sr, rate, rbargs=None)
        wav_2 = pyrb.pitch_shift(wav[1], sr, rate, rbargs=None)
    else:
        wav_1 = pyrb.time_stretch(wav[0], sr, rate, rbargs=None)
        wav_2 = pyrb.time_stretch(wav[1], sr, rate, rbargs=None)

    # Channels can come back with slightly different lengths; pad the shorter.
    if wav_1.shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1.shape)
    if wav_1.shape < wav_2.shape:
        wav_1 = to_shape(wav_1, wav_2.shape)

    wav_mix = np.asfortranarray([wav_1, wav_2])

    sf.write(export_path, normalize(wav_mix.T, is_normalization), sr, subtype=wav_type_set)
    save_format(export_path)
565
+
566
def average_audio(audio):
    """Load several audio files and return their sample-wise mean.

    All files are resampled to 44.1 kHz; shorter ones are zero-padded to
    the longest before averaging.  Returns a (channels, samples) array.
    """

    waves = []
    wave_shapes = []
    final_waves = []

    for i in range(len(audio)):
        # librosa.load returns (data, sr); only the data is kept.
        wave = librosa.load(audio[i], sr=44100, mono=False)
        waves.append(wave[0])
        wave_shapes.append(wave[0].shape[1])

    # Use the longest waveform as the padding target.
    wave_shapes_index = wave_shapes.index(max(wave_shapes))
    target_shape = waves[wave_shapes_index]
    waves.pop(wave_shapes_index)
    final_waves.append(target_shape)

    # Pad every other waveform to the target length.
    for n_array in waves:
        wav_target = to_shape(n_array, target_shape.shape)
        final_waves.append(wav_target)

    waves = sum(final_waves)
    waves = waves/len(audio)

    return waves
590
+
591
def average_dual_sources(wav_1, wav_2, value):
    """Weighted blend of two sources: ``value * wav_1 + (1 - value) * wav_2``.

    The shorter array is zero-padded to the longer one's shape before
    blending.
    """
    if wav_1.shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1.shape)
    if wav_1.shape < wav_2.shape:
        wav_1 = to_shape(wav_1, wav_2.shape)

    return wav_1 * value + wav_2 * (1 - value)
601
+
602
def reshape_sources(wav_1: np.ndarray, wav_2: np.ndarray):
    """Return ``wav_2`` adjusted (padded or trimmed) to ``wav_1``'s length.

    NOTE(review): only wav_2 is returned; the trimming of wav_1 at the end
    has no effect outside this function.
    """

    if wav_1.shape > wav_2.shape:
        wav_2 = to_shape(wav_2, wav_1.shape)
    if wav_1.shape < wav_2.shape:
        ln = min([wav_1.shape[1], wav_2.shape[1]])
        wav_2 = wav_2[:,:ln]

    # Trim both to the common sample length.
    ln = min([wav_1.shape[1], wav_2.shape[1]])
    wav_1 = wav_1[:,:ln]
    wav_2 = wav_2[:,:ln]

    return wav_2
615
+
616
def align_audio(file1, file2, file2_aligned, file_subtracted, wav_type_set, is_normalization, command_Text, progress_bar_main_var, save_format):
    """Estimate the time offset between two recordings, align them and save
    the aligned track plus the difference (file1 - aligned file2).

    The offset is estimated by cross-correlating 1-second windows at random
    positions and voting on the most frequent lag.

    UI callbacks: ``command_Text(str)`` for log output,
    ``progress_bar_main_var.set(int)`` for progress, ``save_format(path)``
    for post-save format conversion.
    """
    def get_diff(a, b):
        # Lag of the cross-correlation peak between the two windows.
        corr = np.correlate(a, b, "full")
        diff = corr.argmax() - (b.shape[0] - 1)
        return diff

    progress_bar_main_var.set(10)

    # read tracks (resampled to 44.1 kHz, transposed to (samples, channels))
    wav1, sr1 = librosa.load(file1, sr=44100, mono=False)
    wav2, sr2 = librosa.load(file2, sr=44100, mono=False)
    wav1 = wav1.transpose()
    wav2 = wav2.transpose()

    command_Text(f"Audio file shapes: {wav1.shape} / {wav2.shape}\n")

    wav2_org = wav2.copy()
    progress_bar_main_var.set(20)

    command_Text("Processing files... \n")

    # pick random position and get diff

    counts = {} # counting up for each diff value
    progress = 20

    check_range = 64

    base = (64 / check_range)

    for i in range(check_range):
        # Random 1 s window (avoiding the first/last 2 s), with a random
        # extra shift so the vote is not biased by a single alignment.
        index = int(random.uniform(44100 * 2, min(wav1.shape[0], wav2.shape[0]) - 44100 * 2))
        shift = int(random.uniform(-22050,+22050))
        samp1 = wav1[index :index +44100, 0] # currently use left channel
        samp2 = wav2[index+shift:index+shift+44100, 0]
        progress += 1 * base
        progress_bar_main_var.set(progress)
        diff = get_diff(samp1, samp2)
        diff -= shift

        # Ignore implausible lags (more than half a second).
        if abs(diff) < 22050:
            if not diff in counts:
                counts[diff] = 0
            counts[diff] += 1

    # use max counted diff value
    max_count = 0
    est_diff = 0
    for diff in counts.keys():
        if counts[diff] > max_count:
            max_count = counts[diff]
            est_diff = diff

    command_Text(f"Estimated difference is {est_diff} (count: {max_count})\n")

    progress_bar_main_var.set(90)

    audio_files = []

    def save_aligned_audio(wav2_aligned):
        # Write the aligned second track and compute the subtraction.
        command_Text(f"Aligned File 2 with File 1.\n")
        command_Text(f"Saving files... ")
        sf.write(file2_aligned, normalize(wav2_aligned, is_normalization), sr2, subtype=wav_type_set)
        save_format(file2_aligned)
        min_len = min(wav1.shape[0], wav2_aligned.shape[0])
        wav_sub = wav1[:min_len] - wav2_aligned[:min_len]
        audio_files.append(file2_aligned)
        return min_len, wav_sub

    # make aligned track 2
    if est_diff > 0:
        # file2 lags: prepend silence.
        wav2_aligned = np.append(np.zeros((est_diff, 2)), wav2_org, axis=0)
        min_len, wav_sub = save_aligned_audio(wav2_aligned)
    elif est_diff < 0:
        # file2 leads: drop the leading samples.
        wav2_aligned = wav2_org[-est_diff:]
        min_len, wav_sub = save_aligned_audio(wav2_aligned)
    else:
        command_Text(f"Audio files already aligned.\n")
        command_Text(f"Saving inverted track... ")
        min_len = min(wav1.shape[0], wav2.shape[0])
        wav_sub = wav1[:min_len] - wav2[:min_len]

    wav_sub = np.clip(wav_sub, -1, +1)

    sf.write(file_subtracted, normalize(wav_sub, is_normalization), sr1, subtype=wav_type_set)
    save_format(file_subtracted)

    progress_bar_main_var.set(95)
uvr5/lib_v5/vr_network/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # VR init.
uvr5/lib_v5/vr_network/layers.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from lib_v5 import spec_utils
6
+
7
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation, packaged as one reusable unit.

    Args:
        nin: input channel count.
        nout: output channel count.
        ksize: convolution kernel size.
        stride: convolution stride.
        pad: zero-padding added to both spatial sides.
        dilation: convolution dilation rate.
        activ: activation-module class (instantiated with no arguments).
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin, nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False),  # bias is redundant right before BatchNorm
            nn.BatchNorm2d(nout),
            activ()
        )

    def forward(self, x):
        # Defining forward() (instead of overriding __call__) keeps
        # nn.Module's hook/dispatch machinery intact; instances remain
        # callable exactly as before.
        return self.conv(x)
25
+
26
class SeperableConv2DBNActiv(nn.Module):
    """Depthwise-separable convolution -> BatchNorm2d -> activation.

    A depthwise conv (groups=nin) followed by a 1x1 pointwise conv.
    NOTE: class name keeps the historical "Seperable" spelling because
    external callers and checkpoints reference it.

    Args:
        nin: input channel count.
        nout: output channel count.
        ksize: depthwise kernel size.
        stride: depthwise stride.
        pad: depthwise padding.
        dilation: depthwise dilation rate.
        activ: activation-module class (instantiated with no arguments).
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(SeperableConv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin, nin,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                groups=nin,  # depthwise: one filter per input channel
                bias=False),
            nn.Conv2d(
                nin, nout,
                kernel_size=1,  # pointwise channel mixing
                bias=False),
            nn.BatchNorm2d(nout),
            activ()
        )

    def forward(self, x):
        # forward() instead of __call__ so nn.Module hooks keep working.
        return self.conv(x)
49
+
50
+
51
class Encoder(nn.Module):
    """Two-conv encoder stage returning (downsampled features, skip).

    conv1 keeps the spatial size (stride 1) and its output is used as the
    skip connection; conv2 applies the requested stride for downsampling.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)

    def forward(self, x):
        # forward() instead of __call__ so nn.Module hooks keep working.
        skip = self.conv1(x)
        h = self.conv2(skip)

        return h, skip
63
+
64
+
65
class Decoder(nn.Module):
    """Decoder stage: 2x bilinear upsample, optional skip concat, conv.

    When a skip tensor is provided it is center-cropped to match the
    upsampled size before channel-wise concatenation. Optional 2D dropout
    is applied after the convolution.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        # forward() instead of __call__ so nn.Module hooks keep working.
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
        if skip is not None:
            # Crop the skip so spatial sizes match after upsampling.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)
        h = self.conv(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
83
+
84
+
85
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling head with a variable branch count.

    Branches: a globally frequency-pooled 1x1 branch, a plain 1x1 branch,
    and three dilated separable 3x3 branches; certain architectures
    (identified by ``nn_architecture`` magic numbers) add one or two extra
    dilated branches. All branch outputs are concatenated and fused by a
    1x1 bottleneck with dropout.

    Args:
        nn_architecture: integer architecture id used to pick 5/6/7 branches.
        nin: per-branch channel count (input and branch output).
        nout: bottleneck output channel count.
        dilations: dilation rates for the three standard dilated branches.
        activ: activation-module class for all sub-blocks.
    """

    def __init__(self, nn_architecture, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
        super(ASPPModule, self).__init__()
        # Global-context branch: pool the frequency axis to 1, keep time.
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        )

        self.nn_architecture = nn_architecture
        # Magic architecture ids — presumably model-weight signatures used
        # to select the branch count; TODO confirm against model loader.
        self.six_layer = [129605]
        self.seven_layer = [537238, 537227, 33966]

        extra_conv = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
        self.conv3 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = SeperableConv2DBNActiv(
            nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)

        if self.nn_architecture in self.six_layer:
            self.conv6 = extra_conv
            nin_x = 6
        elif self.nn_architecture in self.seven_layer:
            # NOTE(review): conv6 and conv7 are the *same* module object
            # (extra_conv), so they share weights and a checkpoint's conv7
            # tensors are what end up loaded last — confirm this matches
            # the pretrained models before "fixing" it.
            self.conv6 = extra_conv
            self.conv7 = extra_conv
            nin_x = 7
        else:
            nin_x = 5

        # Fuse the nin_x concatenated branches down to nout channels.
        self.bottleneck = nn.Sequential(
            Conv2DBNActiv(nin * nin_x, nout, 1, 1, 0, activ=activ),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()
        # Upsample the pooled branch back to the input's spatial size.
        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)

        if self.nn_architecture in self.six_layer:
            feat6 = self.conv6(x)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6), dim=1)
        elif self.nn_architecture in self.seven_layer:
            feat6 = self.conv6(x)
            feat7 = self.conv7(x)
            out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
        else:
            out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)

        bottle = self.bottleneck(out)
        return bottle
uvr5/lib_v5/vr_network/layers_new.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from lib_v5 import spec_utils
6
+
7
class Conv2DBNActiv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation, packaged as one reusable unit.

    Args:
        nin: input channel count.
        nout: output channel count.
        ksize: convolution kernel size.
        stride: convolution stride.
        pad: zero-padding added to both spatial sides.
        dilation: convolution dilation rate.
        activ: activation-module class (instantiated with no arguments).
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
        super(Conv2DBNActiv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(
                nin, nout,
                kernel_size=ksize,
                stride=stride,
                padding=pad,
                dilation=dilation,
                bias=False),  # bias is redundant right before BatchNorm
            nn.BatchNorm2d(nout),
            activ()
        )

    def forward(self, x):
        # Defining forward() (instead of overriding __call__) keeps
        # nn.Module's hook/dispatch machinery intact; instances remain
        # callable exactly as before.
        return self.conv(x)
25
+
26
class Encoder(nn.Module):
    """Two-conv encoder stage (new layout: stride on the first conv).

    Unlike the older Encoder, this variant downsamples in conv1 and
    returns only the feature map — no skip connection.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
        super(Encoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
        self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)

    def forward(self, x):
        # forward() instead of __call__ so nn.Module hooks keep working.
        h = self.conv1(x)
        h = self.conv2(h)

        return h
38
+
39
+
40
class Decoder(nn.Module):
    """Decoder stage: 2x bilinear upsample, optional skip concat, conv.

    When a skip tensor is provided it is center-cropped to match the
    upsampled size before channel-wise concatenation. Optional 2D dropout
    is applied after the convolution.
    """

    def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
        super(Decoder, self).__init__()
        self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x, skip=None):
        # forward() instead of __call__ so nn.Module hooks keep working.
        x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)

        if skip is not None:
            # Crop the skip so spatial sizes match after upsampling.
            skip = spec_utils.crop_center(skip, x)
            x = torch.cat([x, skip], dim=1)

        h = self.conv1(x)

        if self.dropout is not None:
            h = self.dropout(h)

        return h
62
+
63
+
64
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling block (fixed five branches).

    Combines a frequency-pooled global branch, a 1x1 pointwise branch and
    three dilated 3x3 branches, then fuses the five feature maps through a
    1x1 bottleneck. Optional 2D dropout on the fused output.
    """

    def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
        super(ASPPModule, self).__init__()
        # Global-context branch: collapse the frequency axis, then 1x1 conv.
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, None)),
            Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
        )
        # Pointwise branch.
        self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
        # Dilated 3x3 branches (padding equals dilation to keep size).
        self.conv3 = Conv2DBNActiv(nin, nout, 3, 1, dilations[0], dilations[0], activ=activ)
        self.conv4 = Conv2DBNActiv(nin, nout, 3, 1, dilations[1], dilations[1], activ=activ)
        self.conv5 = Conv2DBNActiv(nin, nout, 3, 1, dilations[2], dilations[2], activ=activ)
        self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
        self.dropout = nn.Dropout2d(0.1) if dropout else None

    def forward(self, x):
        _, _, height, width = x.size()
        # Upsample the pooled branch back to the input's spatial size.
        pooled = F.interpolate(
            self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
        branches = [pooled, self.conv2(x), self.conv3(x), self.conv4(x), self.conv5(x)]
        fused = self.bottleneck(torch.cat(branches, dim=1))

        if self.dropout is not None:
            fused = self.dropout(fused)

        return fused
99
+
100
+
101
class LSTMModule(nn.Module):
    """Bidirectional-LSTM branch over the time axis of a spectrogram map.

    A 1x1 conv squeezes the input to a single channel, the frequency bins
    become the LSTM feature dimension, and a dense head projects back to
    ``nin_lstm`` bins, yielding a (N, 1, nbins, nframes) map.
    """

    def __init__(self, nin_conv, nin_lstm, nout_lstm):
        super(LSTMModule, self).__init__()
        self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
        self.lstm = nn.LSTM(
            input_size=nin_lstm,
            hidden_size=nout_lstm // 2,  # bidirectional doubles this back to nout_lstm
            bidirectional=True
        )
        self.dense = nn.Sequential(
            nn.Linear(nout_lstm, nin_lstm),
            nn.BatchNorm1d(nin_lstm),
            nn.ReLU()
        )

    def forward(self, x):
        batch, _, n_bins, n_frames = x.size()
        # Collapse channels to 1 and reorder to (frames, batch, bins) for the LSTM.
        seq = self.conv(x)[:, 0].permute(2, 0, 1)
        seq, _ = self.lstm(seq)
        # Flatten (frames, batch) so BatchNorm1d sees one sample per row.
        flat = self.dense(seq.reshape(-1, seq.size()[-1]))
        out = flat.reshape(n_frames, batch, 1, n_bins)
        # Back to (batch, 1, bins, frames).
        return out.permute(1, 2, 3, 0)
uvr5/lib_v5/vr_network/model_param_init.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pathlib
3
+
4
# Fallback parameters used when no model config file is supplied:
# a two-band 768-bin spectrogram layout at 44.1 kHz.
default_param = {
    'bins': 768,
    'unstable_bins': 9,  # training only
    'reduction_bins': 762,  # training only
    'sr': 44100,
    'pre_filter_start': 757,
    'pre_filter_stop': 768,
    'band': {
        1: {
            'sr': 11025,
            'hl': 128,
            'n_fft': 960,
            'crop_start': 0,
            'crop_stop': 245,
            'lpf_start': 61,  # inference only
            'res_type': 'polyphase',
        },
        2: {
            'sr': 44100,
            'hl': 512,
            'n_fft': 1536,
            'crop_start': 24,
            'crop_stop': 547,
            'hpf_start': 81,  # inference only
            'res_type': 'sinc_best',
        },
    },
}
+
34
+
35
def int_keys(d):
    """Build a dict from (key, value) pairs, converting digit-string keys to int.

    Intended as json's ``object_pairs_hook``, so *d* is a list of pairs;
    later duplicates overwrite earlier ones, as with a plain dict.
    """
    return {int(key) if key.isdigit() else key: value for key, value in d}
42
+
43
+
44
class ModelParameters(object):
    """Spectrogram/band parameter container loaded from a model config.

    Accepts a ``.pth`` archive (a zip containing ``param.json``), a plain
    ``.json`` file, or falls back to ``default_param`` for any other path
    (including the empty default). Digit-string keys (band indices) are
    converted to ints via ``int_keys``, and optional stereo/processing
    flags default to ``False``.
    """

    def __init__(self, config_path=''):
        suffix = pathlib.Path(config_path).suffix
        if suffix == '.pth':
            # Model bundles are zip archives with the params embedded.
            import zipfile

            with zipfile.ZipFile(config_path, 'r') as archive:
                self.param = json.loads(archive.read('param.json'), object_pairs_hook=int_keys)
        elif suffix == '.json':
            with open(config_path, 'r', encoding='utf-8') as f:
                self.param = json.load(f, object_pairs_hook=int_keys)
        else:
            # Deep-copy so the flag defaults added below don't mutate the
            # shared module-level default_param for later instances.
            import copy
            self.param = copy.deepcopy(default_param)

        for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
            if k not in self.param:
                self.param[k] = False
uvr5/lib_v5/vr_network/modelparams/1band_sr16000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 16000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 16000,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr32000_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 32000,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "kaiser_fast"
14
+ }
15
+ },
16
+ "sr": 32000,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr33075_hl384.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 33075,
8
+ "hl": 384,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 33075,
17
+ "pre_filter_start": 1000,
18
+ "pre_filter_stop": 1021
19
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl1024.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 1024,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl256.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 256,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 256,
9
+ "n_fft": 512,
10
+ "crop_start": 0,
11
+ "crop_stop": 256,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 256,
18
+ "pre_filter_stop": 256
19
+ }
uvr5/lib_v5/vr_network/modelparams/1band_sr44100_hl512.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bins": 1024,
3
+ "unstable_bins": 0,
4
+ "reduction_bins": 0,
5
+ "band": {
6
+ "1": {
7
+ "sr": 44100,
8
+ "hl": 512,
9
+ "n_fft": 2048,
10
+ "crop_start": 0,
11
+ "crop_stop": 1024,
12
+ "hpf_start": -1,
13
+ "res_type": "sinc_best"
14
+ }
15
+ },
16
+ "sr": 44100,
17
+ "pre_filter_start": 1023,
18
+ "pre_filter_stop": 1024
19
+ }