<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://tts.wiki/index.php?action=history&amp;feed=atom&amp;title=Emilia_Dataset</id>
	<title>Emilia Dataset - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://tts.wiki/index.php?action=history&amp;feed=atom&amp;title=Emilia_Dataset"/>
	<link rel="alternate" type="text/html" href="https://tts.wiki/index.php?title=Emilia_Dataset&amp;action=history"/>
	<updated>2026-04-03T19:02:14Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.41.5</generator>
	<entry>
		<id>https://tts.wiki/index.php?title=Emilia_Dataset&amp;diff=7&amp;oldid=prev</id>
		<title>Ttswikiadmin: Remove related links</title>
		<link rel="alternate" type="text/html" href="https://tts.wiki/index.php?title=Emilia_Dataset&amp;diff=7&amp;oldid=prev"/>
		<updated>2025-09-19T03:43:07Z</updated>

		<summary type="html">&lt;p&gt;Remove related links&lt;/p&gt;
&lt;table style=&quot;background-color: #fff; color: #202122;&quot; data-mw=&quot;interface&quot;&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;col class=&quot;diff-marker&quot; /&gt;
				&lt;col class=&quot;diff-content&quot; /&gt;
				&lt;tr class=&quot;diff-title&quot; lang=&quot;en&quot;&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;← Older revision&lt;/td&gt;
				&lt;td colspan=&quot;2&quot; style=&quot;background-color: #fff; color: #202122; text-align: center;&quot;&gt;Revision as of 03:43, 19 September 2025&lt;/td&gt;
				&lt;/tr&gt;&lt;tr&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot; id=&quot;mw-diff-left-l97&quot;&gt;Line 97:&lt;/td&gt;
&lt;td colspan=&quot;2&quot; class=&quot;diff-lineno&quot;&gt;Line 97:&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2407.05361 Original Research Paper]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2407.05361 Original Research Paper]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2501.15907 Extended Research Paper]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;* [https://arxiv.org/abs/2501.15907 Extended Research Paper]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;== See Also ==&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[F5-TTS]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Text-to-Speech]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Speech Generation]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Multilingual Datasets]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot; data-marker=&quot;−&quot;&gt;&lt;/td&gt;&lt;td style=&quot;color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #ffe49c; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;&lt;del style=&quot;font-weight: bold; text-decoration: none;&quot;&gt;* [[Voice Cloning]]&lt;/del&gt;&lt;/div&gt;&lt;/td&gt;&lt;td colspan=&quot;2&quot; class=&quot;diff-side-added&quot;&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;br&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Datasets]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Datasets]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Speech Datasets]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Speech Datasets]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;
&lt;tr&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Open Source]]&lt;/div&gt;&lt;/td&gt;&lt;td class=&quot;diff-marker&quot;&gt;&lt;/td&gt;&lt;td style=&quot;background-color: #f8f9fa; color: #202122; font-size: 88%; border-style: solid; border-width: 1px 1px 1px 4px; border-radius: 0.33em; border-color: #eaecf0; vertical-align: top; white-space: pre-wrap;&quot;&gt;&lt;div&gt;[[Category:Open Source]]&lt;/div&gt;&lt;/td&gt;&lt;/tr&gt;

&lt;!-- diff cache key mediawiki:diff:1.41:old-6:rev-7:php=table --&gt;
&lt;/table&gt;</summary>
		<author><name>Ttswikiadmin</name></author>
	</entry>
	<entry>
		<id>https://tts.wiki/index.php?title=Emilia_Dataset&amp;diff=6&amp;oldid=prev</id>
		<title>Ttswikiadmin: Add Emilia dataset</title>
		<link rel="alternate" type="text/html" href="https://tts.wiki/index.php?title=Emilia_Dataset&amp;diff=6&amp;oldid=prev"/>
		<updated>2025-09-19T03:42:43Z</updated>

		<summary type="html">&lt;p&gt;Add Emilia dataset&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;The &amp;#039;&amp;#039;&amp;#039;Emilia Dataset&amp;#039;&amp;#039;&amp;#039; is a large-scale, multilingual, and diverse speech generation dataset derived from in-the-wild speech data. Emilia starts with over 101k hours of speech across six languages, including a wide range of speaking styles for more natural and spontaneous speech generation.&lt;br /&gt;
&lt;br /&gt;
== Overview ==&lt;br /&gt;
&lt;br /&gt;
The Emilia dataset is constructed from a large collection of publicly-available audio on the Internet, such as podcasts, debates, and audiobooks. The dataset was created using &amp;#039;&amp;#039;&amp;#039;Emilia-Pipe&amp;#039;&amp;#039;&amp;#039;, an open-source preprocessing pipeline used to process, transcribe, and filter the dataset.&lt;br /&gt;
&lt;br /&gt;
== Dataset Statistics ==&lt;br /&gt;
&lt;br /&gt;
=== Original Emilia Dataset ===&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
! Language !! Code !! Duration (Hours) &lt;br /&gt;
|-&lt;br /&gt;
| English || EN || 46.8k &lt;br /&gt;
|-&lt;br /&gt;
| Chinese || ZH || 49.9k &lt;br /&gt;
|-&lt;br /&gt;
| German || DE || 1.6k &lt;br /&gt;
|-&lt;br /&gt;
| French || FR || 1.4k &lt;br /&gt;
|-&lt;br /&gt;
| Japanese || JA || 1.7k &lt;br /&gt;
|-&lt;br /&gt;
| Korean || KO || 0.2k &lt;br /&gt;
|-&lt;br /&gt;
| &amp;#039;&amp;#039;&amp;#039;Total&amp;#039;&amp;#039;&amp;#039; || - || &amp;#039;&amp;#039;&amp;#039;101.7k&amp;#039;&amp;#039;&amp;#039;&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
=== Emilia-Large Dataset ===&lt;br /&gt;
The dataset has been expanded to Emilia-Large, a dataset with over 216k hours of speech, making it one of the largest openly-available speech datasets. Emilia-Large combines the original 101k-hour Emilia dataset (licensed under CC BY-NC 4.0) with the new Emilia-YODAS dataset (licensed under CC BY 4.0).&lt;br /&gt;
&lt;br /&gt;
The Emilia-YODAS dataset is based on the YODAS2 dataset, sourced from publicly-available YouTube videos licensed under the Creative Commons license.&lt;br /&gt;
&lt;br /&gt;
== Technical Specifications ==&lt;br /&gt;
&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Sampling Rate&amp;#039;&amp;#039;&amp;#039;: 24 kHz&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Audio Format&amp;#039;&amp;#039;&amp;#039;: WAV files, mono channel&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Sample Width&amp;#039;&amp;#039;&amp;#039;: 16-bit&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Audio Quality&amp;#039;&amp;#039;&amp;#039;: DNSMOS P.835 OVRL score of 2.50&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Languages Supported&amp;#039;&amp;#039;&amp;#039;: 6 (English, Chinese, German, French, Japanese, Korean)&lt;br /&gt;
&lt;br /&gt;
== Emilia-Pipe Processing Pipeline ==&lt;br /&gt;
&lt;br /&gt;
Emilia-Pipe consists of six steps: Standardization, Source Separation, Speaker Diarization, Segmentation by VAD, ASR, and Filtering.&lt;br /&gt;
&lt;br /&gt;
=== Processing Steps ===&lt;br /&gt;
&lt;br /&gt;
==== 1. Standardization ====&lt;br /&gt;
Audio files are converted to WAV format, resampled to 24 kHz, and set to mono-channel. Normalization is performed such that the amplitude levels range between -1 and 1, optimizing for a standard decibel level to minimize distortion.&lt;br /&gt;
&lt;br /&gt;
==== 2. Source Separation ====&lt;br /&gt;
This step extracts clean vocal tracks from audio that may contain background noise or music. The authors employ a pretrained Ultimate Vocal Remover model to precisely isolate the vocal elements.&lt;br /&gt;
&lt;br /&gt;
==== 3. Speaker Diarization ====&lt;br /&gt;
Speaker diarization techniques partition long-form speech data into multiple utterances based on the speaker using the PyAnnote speaker diarization 3.1 pipeline.&lt;br /&gt;
&lt;br /&gt;
==== 4. Segmentation (VAD) ====&lt;br /&gt;
Voice Activity Detection is used to further segment the audio into smaller, manageable chunks suitable for training.&lt;br /&gt;
&lt;br /&gt;
==== 5. Automated Speech Recognition (ASR) ====&lt;br /&gt;
ASR techniques transcribe the segmented speech data. The medium version of the Whisper model is employed, with batched inference for parallel processing.&lt;br /&gt;
&lt;br /&gt;
==== 6. Filtering ====&lt;br /&gt;
Segments that do not meet predetermined quality standards (e.g., DNSMOS score) or confidence criteria for language identification are eliminated, yielding a refined dataset.&lt;br /&gt;
&lt;br /&gt;
== Licensing and Access ==&lt;br /&gt;
&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Emilia Dataset&amp;#039;&amp;#039;&amp;#039;: CC BY-NC 4.0 (Non-commercial use only)&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Emilia-YODAS Dataset&amp;#039;&amp;#039;&amp;#039;: CC BY 4.0&lt;br /&gt;
* &amp;#039;&amp;#039;&amp;#039;Emilia-Pipe Pipeline&amp;#039;&amp;#039;&amp;#039;: Open-source&lt;br /&gt;
&lt;br /&gt;
Users are permitted to use the Emilia dataset only for non-commercial purposes under the CC BY-NC 4.0 license. Emilia does not own the copyright to the audio files; the copyright remains with the original owners of the videos or audio.&lt;br /&gt;
&lt;br /&gt;
== Usage ==&lt;br /&gt;
&lt;br /&gt;
=== Loading the Dataset ===&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
from datasets import load_dataset&lt;br /&gt;
dataset = load_dataset(&amp;quot;amphion/Emilia-Dataset&amp;quot;)&lt;br /&gt;
print(dataset)&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=== Loading Specific Languages ===&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
from datasets import load_dataset&lt;br /&gt;
path = &amp;quot;Emilia/DE/*.tar&amp;quot;&lt;br /&gt;
dataset = load_dataset(&amp;quot;amphion/Emilia-Dataset&amp;quot;, &lt;br /&gt;
                      data_files={&amp;quot;de&amp;quot;: path}, &lt;br /&gt;
                      split=&amp;quot;de&amp;quot;, &lt;br /&gt;
                      streaming=True)&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
== External Links ==&lt;br /&gt;
&lt;br /&gt;
* [https://huggingface.co/datasets/amphion/Emilia-Dataset Hugging Face Dataset Page]&lt;br /&gt;
* [https://emilia-dataset.github.io/Emilia-Demo-Page/ Demo Page]&lt;br /&gt;
* [https://github.com/open-mmlab/Amphion/tree/main/preprocessors/Emilia Emilia-Pipe Source Code]&lt;br /&gt;
* [https://arxiv.org/abs/2407.05361 Original Research Paper]&lt;br /&gt;
* [https://arxiv.org/abs/2501.15907 Extended Research Paper]&lt;br /&gt;
&lt;br /&gt;
== See Also ==&lt;br /&gt;
&lt;br /&gt;
* [[F5-TTS]]&lt;br /&gt;
* [[Text-to-Speech]]&lt;br /&gt;
* [[Speech Generation]]&lt;br /&gt;
* [[Multilingual Datasets]]&lt;br /&gt;
* [[Voice Cloning]]&lt;br /&gt;
&lt;br /&gt;
[[Category:Datasets]]&lt;br /&gt;
[[Category:Speech Datasets]]&lt;br /&gt;
[[Category:Open Source]]&lt;/div&gt;</summary>
		<author><name>Ttswikiadmin</name></author>
	</entry>
</feed>