""" 数据加载模块,用于读取并处理语音情感数据集 支持CASIA、SAVEE和RAVDESS三种数据集 """ import os import glob import numpy as np import pandas as pd import librosa import re from sklearn.model_selection import train_test_split import warnings # 定义常量 SAMPLE_RATE = 22050 # 统一采样率 MAX_DURATION = 5 # 最大音频长度(秒) MAX_SAMPLES = SAMPLE_RATE * MAX_DURATION # 最大样本数 # 情绪映射字典 EMOTION_MAPPING = { # CASIA 情感映射 'angry': 'angry', 'fear': 'fear', 'happy': 'happy', 'neutral': 'neutral', 'sad': 'sad', 'surprise': 'surprise', # SAVEE 情感映射 'a': 'angry', 'f': 'fear', 'h': 'happy', 'n': 'neutral', 'sa': 'sad', 'su': 'surprise', 'd': 'disgust', # 注意:SAVEE有厌恶情绪,但CASIA没有 # RAVDESS 情感映射 '01': 'neutral', '02': 'calm', # 注意:RAVDESS有平静情绪,但CASIA没有 '03': 'happy', '04': 'sad', '05': 'angry', '06': 'fear', '07': 'disgust', # 注意:RAVDESS有厌恶情绪,但CASIA没有 '08': 'surprise' } # 语言映射 LANGUAGE_MAPPING = { 'casia': 'zh', # 中文 'savee': 'en', # 英文 'ravdess': 'en' # 英文 } def load_casia(data_path): """ 加载CASIA中文情感语音数据集 Args: data_path: CASIA数据集路径 Returns: data_list: 包含(音频数据, 情感标签, 语言标签)的列表 """ data_list = [] # 确保路径存在 if not os.path.exists(data_path): print(f"警告: CASIA数据路径不存在: {data_path}") return data_list # 尝试获取演员列表 try: actors = os.listdir(data_path) except Exception as e: print(f"错误: 无法读取CASIA目录: {e}") return data_list success_count = 0 error_count = 0 for actor in actors: # 跳过隐藏文件或非目录 if actor.startswith('_') or not os.path.isdir(os.path.join(data_path, actor)): continue actor_path = os.path.join(data_path, actor) emotions = os.listdir(actor_path) for emotion in emotions: # 跳过隐藏文件或非目录 if emotion.startswith('_') or not os.path.isdir(os.path.join(actor_path, emotion)): continue emotion_path = os.path.join(actor_path, emotion) audio_files = glob.glob(os.path.join(emotion_path, "*.wav")) for audio_file in audio_files: # 读取音频文件 try: audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast') # 统一音频长度 if len(audio) < MAX_SAMPLES: # 音频太短,用0填充 padding = MAX_SAMPLES - len(audio) audio = np.pad(audio, (0, padding), 'constant') else: # 音频太长,截断 audio = audio[:MAX_SAMPLES] data_list.append((audio, EMOTION_MAPPING[emotion], LANGUAGE_MAPPING['casia'])) success_count += 1 except Exception as e: error_count += 1 if error_count < 10: # 只显示前10个错误,避免日志过多 print(f"Error loading {audio_file}: {e}") elif error_count == 10: print("过多加载错误,后续错误将不再显示...") print(f"CASIA数据集: 成功加载 {success_count} 个文件,失败 {error_count} 个文件") return data_list def load_savee(data_path): """ 加载SAVEE英文情感语音数据集 Args: data_path: SAVEE数据集路径 Returns: data_list: 包含(音频数据, 情感标签, 语言标签)的列表 """ data_list = [] # 确保路径存在 if not os.path.exists(data_path): print(f"警告: SAVEE数据路径不存在: {data_path}") return data_list audio_path = os.path.join(data_path, "AudioData") if not os.path.exists(audio_path): print(f"警告: SAVEE AudioData路径不存在: {audio_path}") return data_list # SAVEE数据集中的四个说话者 actors = ['DC', 'JE', 'JK', 'KL'] success_count = 0 error_count = 0 for actor in actors: actor_path = os.path.join(audio_path, actor) if not os.path.isdir(actor_path): print(f"警告: SAVEE演员目录不存在: {actor_path}") continue audio_files = glob.glob(os.path.join(actor_path, "*.wav")) for audio_file in audio_files: file_name = os.path.basename(audio_file) # 提取情感标签,SAVEE使用文件名的前1-2个字母作为情感标签 if file_name.startswith("sa"): emotion = "sa" elif file_name.startswith("su"): emotion = "su" else: emotion = file_name[0] try: audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast') # 统一音频长度 if len(audio) < MAX_SAMPLES: padding = MAX_SAMPLES - len(audio) audio = np.pad(audio, (0, padding), 'constant') else: audio = audio[:MAX_SAMPLES] data_list.append((audio, EMOTION_MAPPING[emotion], LANGUAGE_MAPPING['savee'])) success_count += 1 except Exception as e: error_count += 1 if error_count < 10: # 只显示前10个错误 print(f"Error loading {audio_file}: {e}") elif error_count == 10: print("过多加载错误,后续错误将不再显示...") print(f"SAVEE数据集: 成功加载 {success_count} 个文件,失败 {error_count} 个文件") return data_list def load_ravdess(data_path): """ 加载RAVDESS英文情感语音数据集 Args: data_path: RAVDESS数据集路径 Returns: data_list: 包含(音频数据, 情感标签, 语言标签)的列表 """ data_list = [] # 确保路径存在 if not os.path.exists(data_path): print(f"警告: RAVDESS数据路径不存在: {data_path}") return data_list # 获取所有演员目录 try: actor_dirs = glob.glob(os.path.join(data_path, "Actor_*")) except Exception as e: print(f"错误: 无法获取RAVDESS演员目录: {e}") return data_list if not actor_dirs: print(f"警告: RAVDESS演员目录为空: {data_path}") success_count = 0 error_count = 0 for actor_dir in actor_dirs: audio_files = glob.glob(os.path.join(actor_dir, "*.wav")) for audio_file in audio_files: file_name = os.path.basename(audio_file) # RAVDESS文件名格式: 03-01-05-01-02-01-12.wav # 05 表示情感类别 (angry) parts = file_name.split('-') if len(parts) >= 3: emotion = parts[2] try: audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast') # 统一音频长度 if len(audio) < MAX_SAMPLES: padding = MAX_SAMPLES - len(audio) audio = np.pad(audio, (0, padding), 'constant') else: audio = audio[:MAX_SAMPLES] data_list.append((audio, EMOTION_MAPPING[emotion], LANGUAGE_MAPPING['ravdess'])) success_count += 1 except Exception as e: error_count += 1 if error_count < 10: # 只显示前10个错误 print(f"Error loading {audio_file}: {e}") elif error_count == 10: print("过多加载错误,后续错误将不再显示...") print(f"RAVDESS数据集: 成功加载 {success_count} 个文件,失败 {error_count} 个文件") return data_list def load_all_data(casia_path, savee_path, ravdess_path, selected_emotions=None): """ 加载所有数据集 Args: casia_path: CASIA数据集路径 savee_path: SAVEE数据集路径 ravdess_path: RAVDESS数据集路径 selected_emotions: 要使用的情感列表,如果为None,则使用所有共有的情感 Returns: X: 音频数据列表 y_emotion: 情感标签列表 y_language: 语言标签列表 """ print("加载CASIA数据集...") casia_data = load_casia(casia_path) print("加载SAVEE数据集...") savee_data = load_savee(savee_path) print("加载RAVDESS数据集...") ravdess_data = load_ravdess(ravdess_path) # 合并所有数据 all_data = casia_data + savee_data + ravdess_data # 检查是否有数据被加载 if not all_data: raise ValueError("没有成功加载任何数据!请检查数据路径和文件格式。") # 如果指定了要使用的情感列表,筛选数据 if selected_emotions: filtered_data = [item for item in all_data if item[1] in selected_emotions] if not filtered_data: print(f"警告: 筛选后没有匹配的情感数据。可用的情感标签: {set(item[1] for item in all_data)}") print(f"您请求的情感标签: {selected_emotions}") # 回退到使用所有数据 filtered_data = all_data all_data = filtered_data print(f"总共加载了 {len(all_data)} 个有效音频文件") # 显示各情感类别的数据分布 emotion_counts = {} for item in all_data: emotion = item[1] if emotion in emotion_counts: emotion_counts[emotion] += 1 else: emotion_counts[emotion] = 1 print("数据分布:") for emotion, count in emotion_counts.items(): print(f" {emotion}: {count} 个样本") # 分离数据、情感标签和语言标签 X = [item[0] for item in all_data] y_emotion = [item[1] for item in all_data] y_language = [item[2] for item in all_data] return X, y_emotion, y_language def prepare_data(X, y_emotion, y_language, test_size=0.2, val_size=0.2, random_state=42): """ 准备训练集、验证集和测试集 Args: X: 音频数据 y_emotion: 情感标签 y_language: 语言标签 test_size: 测试集比例 val_size: 验证集比例 random_state: 随机种子 Returns: 训练集、验证集和测试集数据和标签 """ # 确保数据不为空 if len(X) == 0: raise ValueError("数据集为空,无法进行划分!请确保至少加载了一些有效的音频文件。") # 先划分出测试集 X_train_val, X_test, y_emotion_train_val, y_emotion_test, y_language_train_val, y_language_test = train_test_split( X, y_emotion, y_language, test_size=test_size, random_state=random_state, stratify=y_emotion ) # 从剩余数据中划分训练集和验证集 val_ratio = val_size / (1 - test_size) X_train, X_val, y_emotion_train, y_emotion_val, y_language_train, y_language_val = train_test_split( X_train_val, y_emotion_train_val, y_language_train_val, test_size=val_ratio, random_state=random_state, stratify=y_emotion_train_val ) # 打印数据集大小 print(f"数据集划分: 训练集 {len(X_train)} 个样本, 验证集 {len(X_val)} 个样本, 测试集 {len(X_test)} 个样本") return ( X_train, y_emotion_train, y_language_train, X_val, y_emotion_val, y_language_val, X_test, y_emotion_test, y_language_test )