{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"# 多语言语音情感分析系统\n",
"\n",
"本Notebook按照题目要求,实现语音情感分析系统的各个部分:\n",
"1. 对多语言语音数据集进行预处理\n",
"2. 特征工程\n",
"3. 分类预测模型构建与分析\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 环境准备\n",
"\n",
"首先导入必要的库和设置环境\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# 导入基础库\n",
"import os\n",
"import glob # 添加glob模块用于文件匹配\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import librosa\n",
"import librosa.display\n",
"from tqdm import tqdm\n",
"import pickle\n",
"import warnings\n",
"\n",
"# 机器学习库\n",
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from tensorflow.keras.utils import to_categorical\n",
"import tensorflow as tf\n",
"\n",
"# 忽略警告\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# 设置随机种子,确保结果可重复\n",
"np.random.seed(42)\n",
"tf.random.set_seed(42)\n",
"\n",
"# 设置图形样式\n",
"plt.style.use('ggplot')\n",
"plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文\n",
"plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"# 1. 对多语言语音数据集进行预处理\n",
"\n",
"## 1.1 使用librosa库对数据进行初步读取与探索\n",
"\n",
"我们将加载音频文件,可视化波形,并进行基本的数据探索。\n"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"正在处理 3 个音频文件...\n",
"\n",
"音频特征DataFrame:\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" file_name | \n",
" dataset | \n",
" emotion | \n",
" duration | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" tuning_offset | \n",
" spectral_centroid_mean | \n",
" ... | \n",
" spectral_flatness_max | \n",
" spectral_flatness_min | \n",
" rms_mean | \n",
" rms_std | \n",
" rms_max | \n",
" rms_min | \n",
" zero_crossing_rate_mean | \n",
" zero_crossing_rate_std | \n",
" zero_crossing_rate_max | \n",
" zero_crossing_rate_min | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 03-01-01-01-01-01-01.wav | \n",
" RAVDESS | \n",
" neutral | \n",
" 5.0 | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3988.073975 | \n",
" 147.432693 | \n",
" -0.17 | \n",
" 2086.724971 | \n",
" ... | \n",
" 1.000001 | \n",
" 7.847282e-05 | \n",
" 0.001493 | \n",
" 0.002882 | \n",
" 0.012591 | \n",
" 0.0 | \n",
" 0.149477 | \n",
" 0.170416 | \n",
" 0.528320 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 201.wav | \n",
" CASIA | \n",
" unknown | \n",
" 5.0 | \n",
" 815.843445 | \n",
" 1158.526733 | \n",
" 3824.677246 | \n",
" 171.179718 | \n",
" 0.10 | \n",
" 747.381791 | \n",
" ... | \n",
" 1.000001 | \n",
" 2.586936e-06 | \n",
" 0.019801 | \n",
" 0.039385 | \n",
" 0.146754 | \n",
" 0.0 | \n",
" 0.039092 | \n",
" 0.093996 | \n",
" 0.526855 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" a01.wav | \n",
" SAVEE | \n",
" angry | \n",
" 5.0 | \n",
" 523.944702 | \n",
" 591.257019 | \n",
" 3405.564209 | \n",
" 149.664169 | \n",
" -0.23 | \n",
" 669.513790 | \n",
" ... | \n",
" 1.000001 | \n",
" 8.574546e-08 | \n",
" 0.097924 | \n",
" 0.099660 | \n",
" 0.433721 | \n",
" 0.0 | \n",
" 0.020286 | \n",
" 0.032456 | \n",
" 0.202148 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 25 columns
\n",
"
"
],
"text/plain": [
" file_name dataset emotion duration pitch_mean \\\n",
"0 03-01-01-01-01-01-01.wav RAVDESS neutral 5.0 573.830444 \n",
"1 201.wav CASIA unknown 5.0 815.843445 \n",
"2 a01.wav SAVEE angry 5.0 523.944702 \n",
"\n",
" pitch_std pitch_max pitch_min tuning_offset \\\n",
"0 853.130066 3988.073975 147.432693 -0.17 \n",
"1 1158.526733 3824.677246 171.179718 0.10 \n",
"2 591.257019 3405.564209 149.664169 -0.23 \n",
"\n",
" spectral_centroid_mean ... spectral_flatness_max spectral_flatness_min \\\n",
"0 2086.724971 ... 1.000001 7.847282e-05 \n",
"1 747.381791 ... 1.000001 2.586936e-06 \n",
"2 669.513790 ... 1.000001 8.574546e-08 \n",
"\n",
" rms_mean rms_std rms_max rms_min zero_crossing_rate_mean \\\n",
"0 0.001493 0.002882 0.012591 0.0 0.149477 \n",
"1 0.019801 0.039385 0.146754 0.0 0.039092 \n",
"2 0.097924 0.099660 0.433721 0.0 0.020286 \n",
"\n",
" zero_crossing_rate_std zero_crossing_rate_max zero_crossing_rate_min \n",
"0 0.170416 0.528320 0.0 \n",
"1 0.093996 0.526855 0.0 \n",
"2 0.032456 0.202148 0.0 \n",
"\n",
"[3 rows x 25 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 定义常量\n",
"SAMPLE_RATE = 22050 # 统一采样率\n",
"MAX_DURATION = 5 # 最大音频长度(秒)\n",
"MAX_SAMPLES = SAMPLE_RATE * MAX_DURATION # 最大样本数\n",
"\n",
"# 加载音频文件\n",
"def load_audio(file_path):\n",
"    \"\"\"\n",
"    Load an audio file and force it to exactly MAX_SAMPLES samples.\n",
"\n",
"    Args:\n",
"        file_path: Path of the audio file to read.\n",
"\n",
"    Returns:\n",
"        audio: Samples loaded at SAMPLE_RATE, zero-padded at the end or\n",
"            truncated so that len(audio) == MAX_SAMPLES.\n",
"        sr: Sampling rate returned by librosa.load.\n",
"    \"\"\"\n",
"    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
"\n",
"    missing = MAX_SAMPLES - len(signal)\n",
"    if missing > 0:\n",
"        # Too short: append trailing zeros.\n",
"        signal = np.pad(signal, (0, missing), 'constant')\n",
"    else:\n",
"        # Long enough: keep only the first MAX_SAMPLES samples.\n",
"        signal = signal[:MAX_SAMPLES]\n",
"\n",
"    return signal, rate\n",
"\n",
"# 提取音高特征\n",
"def extract_pitch_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Summarise the per-frame dominant pitch estimated with librosa.piptrack.\n",
"\n",
"    For every frame the pitch of the bin with the largest magnitude is\n",
"    selected; frames whose selected pitch is not positive are discarded\n",
"    before the statistics are computed.\n",
"\n",
"    Args:\n",
"        audio: Audio signal passed to librosa.piptrack as y.\n",
"        sr: Sampling rate of the signal.\n",
"\n",
"    Returns:\n",
"        features: Dict with keys pitch_mean, pitch_std, pitch_max and\n",
"            pitch_min. All four are 0 when no frame has a positive pitch;\n",
"            pitch_std is 0 when only one frame remains.\n",
"    \"\"\"\n",
"    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)\n",
"\n",
"    # One argmax over the bin axis selects the strongest bin of every frame\n",
"    # at once, replacing a Python-level loop over frames.\n",
"    strongest_bin = np.argmax(magnitudes, axis=0)\n",
"    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]\n",
"    voiced = frame_pitch[frame_pitch > 0]  # drop frames treated as silent\n",
"\n",
"    if voiced.size == 0:\n",
"        return {'pitch_mean': 0, 'pitch_std': 0, 'pitch_max': 0, 'pitch_min': 0}\n",
"\n",
"    return {\n",
"        'pitch_mean': np.mean(voiced),\n",
"        'pitch_std': np.std(voiced) if voiced.size > 1 else 0,\n",
"        'pitch_max': np.max(voiced),\n",
"        'pitch_min': np.min(voiced),\n",
"    }\n",
"\n",
"# 提取调谐偏差特征\n",
"def extract_tuning_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Estimate the tuning offset of the signal with librosa.estimate_tuning.\n",
"\n",
"    Args:\n",
"        audio: Audio signal.\n",
"        sr: Sampling rate of the signal.\n",
"\n",
"    Returns:\n",
"        features: Dict with the single key tuning_offset.\n",
"    \"\"\"\n",
"    return {'tuning_offset': librosa.estimate_tuning(y=audio, sr=sr)}\n",
"\n",
"# 提取频谱质心特征\n",
"def extract_spectral_centroid_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Compute summary statistics of the spectral centroid.\n",
"\n",
"    Args:\n",
"        audio: Audio signal.\n",
"        sr: Sampling rate of the signal.\n",
"\n",
"    Returns:\n",
"        features: Dict with spectral_centroid_mean, spectral_centroid_std,\n",
"            spectral_centroid_max and spectral_centroid_min.\n",
"    \"\"\"\n",
"    # Row 0 of the librosa result is the centroid series that gets summarised.\n",
"    centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]\n",
"\n",
"    reducers = (('mean', np.mean), ('std', np.std), ('max', np.max), ('min', np.min))\n",
"    return {f'spectral_centroid_{tag}': fn(centroid) for tag, fn in reducers}\n",
"\n",
"# 提取频谱平坦度特征\n",
"def extract_spectral_flatness_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Compute summary statistics of the spectral flatness.\n",
"\n",
"    Args:\n",
"        audio: Audio signal.\n",
"        sr: Sampling rate; accepted for a uniform extractor signature but\n",
"            not used by the flatness computation.\n",
"\n",
"    Returns:\n",
"        features: Dict with spectral_flatness_mean, spectral_flatness_std,\n",
"            spectral_flatness_max and spectral_flatness_min.\n",
"    \"\"\"\n",
"    flatness = librosa.feature.spectral_flatness(y=audio)[0]\n",
"\n",
"    return {\n",
"        'spectral_flatness_mean': np.mean(flatness),\n",
"        'spectral_flatness_std': np.std(flatness),\n",
"        'spectral_flatness_max': np.max(flatness),\n",
"        'spectral_flatness_min': np.min(flatness),\n",
"    }\n",
"\n",
"# 提取均方根能量特征\n",
"def extract_rms_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Compute summary statistics of RMS energy and of the zero-crossing rate.\n",
"\n",
"    Despite its name, this extractor returns both feature groups.\n",
"\n",
"    Args:\n",
"        audio: Audio signal.\n",
"        sr: Sampling rate; accepted for a uniform extractor signature but\n",
"            not used here.\n",
"\n",
"    Returns:\n",
"        features: Dict with rms_{mean,std,max,min} followed by\n",
"            zero_crossing_rate_{mean,std,max,min}.\n",
"    \"\"\"\n",
"    # Insertion order fixes the key order of the result: RMS first, then ZCR.\n",
"    series = {\n",
"        'rms': librosa.feature.rms(y=audio)[0],\n",
"        'zero_crossing_rate': librosa.feature.zero_crossing_rate(audio)[0],\n",
"    }\n",
"\n",
"    features = {}\n",
"    for name, values in series.items():\n",
"        features[f'{name}_mean'] = np.mean(values)\n",
"        features[f'{name}_std'] = np.std(values)\n",
"        features[f'{name}_max'] = np.max(values)\n",
"        features[f'{name}_min'] = np.min(values)\n",
"\n",
"    return features\n",
"\n",
"# 提取所有特征并创建DataFrame\n",
"def extract_all_features(audio, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Run every feature extractor on the signal and merge the results.\n",
"\n",
"    Args:\n",
"        audio: Audio signal.\n",
"        sr: Sampling rate of the signal.\n",
"\n",
"    Returns:\n",
"        features: One dict holding the keys of all extractors, in the order\n",
"            pitch, tuning, spectral centroid, spectral flatness, RMS/ZCR.\n",
"    \"\"\"\n",
"    extractors = (\n",
"        extract_pitch_features,\n",
"        extract_tuning_features,\n",
"        extract_spectral_centroid_features,\n",
"        extract_spectral_flatness_features,\n",
"        extract_rms_features,\n",
"    )\n",
"\n",
"    merged = {}\n",
"    for extractor in extractors:\n",
"        merged.update(extractor(audio, sr))\n",
"\n",
"    return merged\n",
"\n",
"# 加载并提取所有音频文件的特征\n",
"def _infer_dataset_and_emotion(file_path):\n",
"    \"\"\"Return (dataset, emotion) guessed from the path; 'unknown' when no rule matches.\"\"\"\n",
"    file_name = os.path.basename(file_path)\n",
"    dataset = \"unknown\"\n",
"    emotion = \"unknown\"\n",
"\n",
"    if \"RAVDESS\" in file_path:\n",
"        dataset = \"RAVDESS\"\n",
"        # The third dash-separated field of the file name is the emotion code.\n",
"        parts = file_name.split('-')\n",
"        if len(parts) >= 3:\n",
"            emotion_map = {'01': 'neutral', '03': 'happy', '04': 'sad',\n",
"                           '05': 'angry', '06': 'fear', '08': 'surprise'}\n",
"            emotion = emotion_map.get(parts[2], emotion)\n",
"    elif \"CAISA\" in file_path:\n",
"        # Matches the 'CAISA' directory spelling used by the sample paths in\n",
"        # this notebook; the dataset itself is reported as 'CASIA'.\n",
"        dataset = \"CASIA\"\n",
"        # The emotion is the name of the directory that holds the file. Use\n",
"        # os.path helpers rather than splitting on os.sep: the paths are\n",
"        # written with '/', so on Windows (where os.sep is a backslash) the\n",
"        # split never found the directory and the label stayed 'unknown'.\n",
"        parent = os.path.basename(os.path.dirname(os.path.normpath(file_path)))\n",
"        if parent:\n",
"            emotion = parent\n",
"    elif \"SAVEE\" in file_path:\n",
"        dataset = \"SAVEE\"\n",
"        # The emotion is encoded as a prefix of the file name.\n",
"        for prefix, label in (('sa', 'sad'), ('su', 'surprise'), ('a', 'angry'),\n",
"                              ('f', 'fear'), ('h', 'happy'), ('n', 'neutral')):\n",
"            if file_name.startswith(prefix):\n",
"                emotion = label\n",
"                break\n",
"\n",
"    return dataset, emotion\n",
"\n",
"\n",
"def process_audio_files(file_paths):\n",
"    \"\"\"\n",
"    Extract features from several audio files into one DataFrame.\n",
"\n",
"    Dataset and emotion labels are inferred from each path. A file whose\n",
"    loading or feature extraction raises is reported with a printed message\n",
"    and skipped, so one bad file does not abort the batch.\n",
"\n",
"    Args:\n",
"        file_paths: Iterable of audio file paths.\n",
"\n",
"    Returns:\n",
"        df: DataFrame with one row per successfully processed file. The\n",
"            columns file_name, dataset, emotion and duration come first,\n",
"            followed by the feature columns. None when no file could be\n",
"            processed.\n",
"    \"\"\"\n",
"    records = []\n",
"\n",
"    for file_path in file_paths:\n",
"        dataset, emotion = _infer_dataset_and_emotion(file_path)\n",
"\n",
"        try:\n",
"            audio, sr = load_audio(file_path)\n",
"            features = extract_all_features(audio, sr)\n",
"\n",
"            features['file_name'] = os.path.basename(file_path)\n",
"            features['dataset'] = dataset\n",
"            features['emotion'] = emotion\n",
"            features['duration'] = len(audio) / sr\n",
"\n",
"            records.append(features)\n",
"        except Exception as e:\n",
"            # Deliberate best effort: report the failing file and continue.\n",
"            print(f\"处理文件 {file_path} 时出错: {e}\")\n",
"\n",
"    if not records:\n",
"        print(\"没有成功处理任何文件\")\n",
"        return None\n",
"\n",
"    df = pd.DataFrame(records)\n",
"\n",
"    # Put the file information columns in front of the feature columns.\n",
"    info_cols = ['file_name', 'dataset', 'emotion', 'duration']\n",
"    return df[info_cols + [col for col in df.columns if col not in info_cols]]\n",
"\n",
"# One sample recording per dataset; only files that exist on disk are processed.\n",
"ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
"casia_file = './CAISA/liuchanhg/angry/201.wav'\n",
"savee_file = './SAVEE/AudioData/DC/a01.wav'\n",
"\n",
"audio_files = [\n",
"    path for path in (ravdess_file, casia_file, savee_file)\n",
"    if os.path.exists(path)\n",
"]\n",
"\n",
"if not audio_files:\n",
"    print(\"未找到任何音频文件\")\n",
"else:\n",
"    print(f\"正在处理 {len(audio_files)} 个音频文件...\")\n",
"    features_df = process_audio_files(audio_files)\n",
"\n",
"    # process_audio_files returns None when no file could be processed.\n",
"    if features_df is not None:\n",
"        print(\"\\n音频特征DataFrame:\")\n",
"        display(features_df)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"# 2. 特征工程\n",
"\n",
"在这一部分,我们将从音频信号中提取各种声学特征,这些特征对于情感识别非常重要。\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.1 构建音高、倾斜调谐偏差指标的代码\n",
"在这一节中,我们将实现从音频中提取音高和调谐偏差指标的功能。这些特征可以帮助我们捕捉说话者的语调变化,对情感识别有重要意义。\n"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# 导入必要的库\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"# 定义常量\n",
"SAMPLE_RATE = 22050 # 统一采样率\n",
"MAX_DURATION = 5 # 最大音频长度(秒)\n",
"MAX_SAMPLES = SAMPLE_RATE * MAX_DURATION # 最大样本数\n",
"\n",
"# 加载音频并提取特征的函数\n",
"def extract_audio_features(file_path, n_mfcc=13):\n",
"    \"\"\"\n",
"    Load an audio file and extract its summary features into a DataFrame.\n",
"\n",
"    Args:\n",
"        file_path: Path of the audio file to read.\n",
"        n_mfcc: Number of MFCC coefficients to compute; each one contributes\n",
"            an mfcc_<i>_mean and an mfcc_<i>_std column (i starts at 1).\n",
"\n",
"    Returns:\n",
"        features_df: Single-row DataFrame with pitch, tuning, spectral\n",
"            centroid, spectral flatness, zero-crossing rate, RMS and MFCC\n",
"            statistics, in that column order.\n",
"        audio: The loaded signal, zero-padded or truncated to MAX_SAMPLES.\n",
"        sr: Sampling rate returned by librosa.load.\n",
"    \"\"\"\n",
"    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
"\n",
"    # Force a fixed length: append zeros when too short, cut when too long.\n",
"    if len(audio) < MAX_SAMPLES:\n",
"        audio = np.pad(audio, (0, MAX_SAMPLES - len(audio)), 'constant')\n",
"    else:\n",
"        audio = audio[:MAX_SAMPLES]\n",
"\n",
"    # Per frame, take the pitch of the strongest bin. A single argmax over\n",
"    # the bin axis replaces a Python-level loop over frames.\n",
"    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)\n",
"    strongest_bin = np.argmax(magnitudes, axis=0)\n",
"    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]\n",
"    voiced = frame_pitch[frame_pitch > 0]  # drop frames treated as silent\n",
"\n",
"    features = {\n",
"        'pitch_mean': np.mean(voiced) if voiced.size else 0,\n",
"        'pitch_std': np.std(voiced) if voiced.size > 1 else 0,\n",
"        'pitch_max': np.max(voiced) if voiced.size else 0,\n",
"        'pitch_min': np.min(voiced) if voiced.size else 0,\n",
"        'tuning_offset': librosa.estimate_tuning(y=audio, sr=sr),\n",
"    }\n",
"\n",
"    # Frame-level series summarised by mean/std/max/min; the tuple order\n",
"    # determines the column order of the resulting DataFrame.\n",
"    frame_series = (\n",
"        ('spectral_centroid', librosa.feature.spectral_centroid(y=audio, sr=sr)[0]),\n",
"        ('spectral_flatness', librosa.feature.spectral_flatness(y=audio)[0]),\n",
"        ('zero_crossing_rate', librosa.feature.zero_crossing_rate(audio)[0]),\n",
"        ('rms', librosa.feature.rms(y=audio)[0]),\n",
"    )\n",
"    for name, values in frame_series:\n",
"        features[f'{name}_mean'] = np.mean(values)\n",
"        features[f'{name}_std'] = np.std(values)\n",
"        features[f'{name}_max'] = np.max(values)\n",
"        features[f'{name}_min'] = np.min(values)\n",
"\n",
"    # Iterating over the rows keeps the column count in step with n_mfcc.\n",
"    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)\n",
"    for index, coefficient in enumerate(mfccs, start=1):\n",
"        features[f'mfcc_{index}_mean'] = np.mean(coefficient)\n",
"        features[f'mfcc_{index}_std'] = np.std(coefficient)\n",
"\n",
"    features_df = pd.DataFrame([features])\n",
"\n",
"    return features_df, audio, sr\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"处理文件: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"处理文件: ./CAISA/liuchanhg/angry/201.wav\n",
"处理文件: ./SAVEE/AudioData/DC/a01.wav\n",
"\n",
"音频特征DataFrame:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" file_name | \n",
" dataset | \n",
" emotion | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" tuning_offset | \n",
" spectral_centroid_mean | \n",
" spectral_centroid_std | \n",
" ... | \n",
" mfcc_9_mean | \n",
" mfcc_9_std | \n",
" mfcc_10_mean | \n",
" mfcc_10_std | \n",
" mfcc_11_mean | \n",
" mfcc_11_std | \n",
" mfcc_12_mean | \n",
" mfcc_12_std | \n",
" mfcc_13_mean | \n",
" mfcc_13_std | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 03-01-01-01-01-01-01.wav | \n",
" RAVDESS | \n",
" neutral | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3988.073975 | \n",
" 147.432693 | \n",
" -0.17 | \n",
" 2086.724971 | \n",
" 1955.973249 | \n",
" ... | \n",
" -9.014230 | \n",
" 17.056114 | \n",
" -0.240036 | \n",
" 6.110453 | \n",
" -0.998587 | \n",
" 7.636956 | \n",
" -0.585506 | \n",
" 7.082282 | \n",
" -0.012864 | \n",
" 8.772419 | \n",
"
\n",
" \n",
" | 1 | \n",
" 201.wav | \n",
" CASIA | \n",
" unknown | \n",
" 815.843445 | \n",
" 1158.526733 | \n",
" 3824.677246 | \n",
" 171.179718 | \n",
" 0.10 | \n",
" 747.381791 | \n",
" 1237.797995 | \n",
" ... | \n",
" -2.926738 | \n",
" 12.263833 | \n",
" -2.856271 | \n",
" 9.881656 | \n",
" 0.429380 | \n",
" 6.724971 | \n",
" -3.176131 | \n",
" 9.400505 | \n",
" 1.080766 | \n",
" 7.199394 | \n",
"
\n",
" \n",
" | 2 | \n",
" a01.wav | \n",
" SAVEE | \n",
" angry | \n",
" 523.944702 | \n",
" 591.257019 | \n",
" 3405.564209 | \n",
" 149.664169 | \n",
" -0.23 | \n",
" 669.513790 | \n",
" 916.785300 | \n",
" ... | \n",
" -0.700413 | \n",
" 10.004570 | \n",
" -8.432466 | \n",
" 15.594891 | \n",
" -1.799932 | \n",
" 11.306618 | \n",
" -0.687864 | \n",
" 8.437500 | \n",
" -0.647611 | \n",
" 8.446262 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 50 columns
\n",
"
"
],
"text/plain": [
" file_name dataset emotion pitch_mean pitch_std \\\n",
"0 03-01-01-01-01-01-01.wav RAVDESS neutral 573.830444 853.130066 \n",
"1 201.wav CASIA unknown 815.843445 1158.526733 \n",
"2 a01.wav SAVEE angry 523.944702 591.257019 \n",
"\n",
" pitch_max pitch_min tuning_offset spectral_centroid_mean \\\n",
"0 3988.073975 147.432693 -0.17 2086.724971 \n",
"1 3824.677246 171.179718 0.10 747.381791 \n",
"2 3405.564209 149.664169 -0.23 669.513790 \n",
"\n",
" spectral_centroid_std ... mfcc_9_mean mfcc_9_std mfcc_10_mean \\\n",
"0 1955.973249 ... -9.014230 17.056114 -0.240036 \n",
"1 1237.797995 ... -2.926738 12.263833 -2.856271 \n",
"2 916.785300 ... -0.700413 10.004570 -8.432466 \n",
"\n",
" mfcc_10_std mfcc_11_mean mfcc_11_std mfcc_12_mean mfcc_12_std \\\n",
"0 6.110453 -0.998587 7.636956 -0.585506 7.082282 \n",
"1 9.881656 0.429380 6.724971 -3.176131 9.400505 \n",
"2 15.594891 -1.799932 11.306618 -0.687864 8.437500 \n",
"\n",
" mfcc_13_mean mfcc_13_std \n",
"0 -0.012864 8.772419 \n",
"1 1.080766 7.199394 \n",
"2 -0.647611 8.446262 \n",
"\n",
"[3 rows x 50 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"特征统计信息:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" tuning_offset | \n",
" spectral_centroid_mean | \n",
" spectral_centroid_std | \n",
" spectral_centroid_max | \n",
" spectral_centroid_min | \n",
" spectral_flatness_mean | \n",
" ... | \n",
" mfcc_9_mean | \n",
" mfcc_9_std | \n",
" mfcc_10_mean | \n",
" mfcc_10_std | \n",
" mfcc_11_mean | \n",
" mfcc_11_std | \n",
" mfcc_12_mean | \n",
" mfcc_12_std | \n",
" mfcc_13_mean | \n",
" mfcc_13_std | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.0 | \n",
" 3.000000 | \n",
" ... | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
" | mean | \n",
" 637.872864 | \n",
" 867.637939 | \n",
" 3739.438477 | \n",
" 156.092209 | \n",
" -0.100000 | \n",
" 1167.873517 | \n",
" 1370.185515 | \n",
" 5290.721691 | \n",
" 0.0 | \n",
" 0.450286 | \n",
" ... | \n",
" -4.213793 | \n",
" 13.108172 | \n",
" -3.842924 | \n",
" 10.529000 | \n",
" -0.789713 | \n",
" 8.556182 | \n",
" -1.483167 | \n",
" 8.306762 | \n",
" 0.140097 | \n",
" 8.139359 | \n",
"
\n",
" \n",
" | std | \n",
" 156.132294 | \n",
" 283.912994 | \n",
" 300.464050 | \n",
" 13.113729 | \n",
" 0.175784 | \n",
" 796.700603 | \n",
" 532.092792 | \n",
" 1190.637435 | \n",
" 0.0 | \n",
" 0.185900 | \n",
" ... | \n",
" 4.303751 | \n",
" 3.600799 | \n",
" 4.184387 | \n",
" 4.775241 | \n",
" 1.129238 | \n",
" 2.425202 | \n",
" 1.467043 | \n",
" 1.164628 | \n",
" 0.874282 | \n",
" 0.830208 | \n",
"
\n",
" \n",
" | min | \n",
" 523.944702 | \n",
" 591.257019 | \n",
" 3405.564209 | \n",
" 147.432693 | \n",
" -0.230000 | \n",
" 669.513790 | \n",
" 916.785300 | \n",
" 4111.711753 | \n",
" 0.0 | \n",
" 0.259868 | \n",
" ... | \n",
" -9.014230 | \n",
" 10.004570 | \n",
" -8.432466 | \n",
" 6.110453 | \n",
" -1.799932 | \n",
" 6.724971 | \n",
" -3.176131 | \n",
" 7.082282 | \n",
" -0.647611 | \n",
" 7.199394 | \n",
"
\n",
" \n",
" | 25% | \n",
" 548.887573 | \n",
" 722.193542 | \n",
" 3615.120728 | \n",
" 148.548431 | \n",
" -0.200000 | \n",
" 708.447791 | \n",
" 1077.291648 | \n",
" 4689.754748 | \n",
" 0.0 | \n",
" 0.359773 | \n",
" ... | \n",
" -5.970484 | \n",
" 11.134202 | \n",
" -5.644368 | \n",
" 7.996054 | \n",
" -1.399260 | \n",
" 7.180964 | \n",
" -1.931997 | \n",
" 7.759891 | \n",
" -0.330237 | \n",
" 7.822828 | \n",
"
\n",
" \n",
" | 50% | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3824.677246 | \n",
" 149.664169 | \n",
" -0.170000 | \n",
" 747.381791 | \n",
" 1237.797995 | \n",
" 5267.797743 | \n",
" 0.0 | \n",
" 0.459678 | \n",
" ... | \n",
" -2.926738 | \n",
" 12.263833 | \n",
" -2.856271 | \n",
" 9.881656 | \n",
" -0.998587 | \n",
" 7.636956 | \n",
" -0.687864 | \n",
" 8.437500 | \n",
" -0.012864 | \n",
" 8.446262 | \n",
"
\n",
" \n",
" | 75% | \n",
" 694.836945 | \n",
" 1005.828400 | \n",
" 3906.375610 | \n",
" 160.421944 | \n",
" -0.035000 | \n",
" 1417.053381 | \n",
" 1596.885622 | \n",
" 5880.226659 | \n",
" 0.0 | \n",
" 0.545495 | \n",
" ... | \n",
" -1.813576 | \n",
" 14.659974 | \n",
" -1.548153 | \n",
" 12.738273 | \n",
" -0.284604 | \n",
" 9.471787 | \n",
" -0.636685 | \n",
" 8.919003 | \n",
" 0.533951 | \n",
" 8.609341 | \n",
"
\n",
" \n",
" | max | \n",
" 815.843445 | \n",
" 1158.526733 | \n",
" 3988.073975 | \n",
" 171.179718 | \n",
" 0.100000 | \n",
" 2086.724971 | \n",
" 1955.973249 | \n",
" 6492.655576 | \n",
" 0.0 | \n",
" 0.631312 | \n",
" ... | \n",
" -0.700413 | \n",
" 17.056114 | \n",
" -0.240036 | \n",
" 15.594891 | \n",
" 0.429380 | \n",
" 11.306618 | \n",
" -0.585506 | \n",
" 9.400505 | \n",
" 1.080766 | \n",
" 8.772419 | \n",
"
\n",
" \n",
"
\n",
"
8 rows × 47 columns
\n",
"
"
],
"text/plain": [
" pitch_mean pitch_std pitch_max pitch_min tuning_offset \\\n",
"count 3.000000 3.000000 3.000000 3.000000 3.000000 \n",
"mean 637.872864 867.637939 3739.438477 156.092209 -0.100000 \n",
"std 156.132294 283.912994 300.464050 13.113729 0.175784 \n",
"min 523.944702 591.257019 3405.564209 147.432693 -0.230000 \n",
"25% 548.887573 722.193542 3615.120728 148.548431 -0.200000 \n",
"50% 573.830444 853.130066 3824.677246 149.664169 -0.170000 \n",
"75% 694.836945 1005.828400 3906.375610 160.421944 -0.035000 \n",
"max 815.843445 1158.526733 3988.073975 171.179718 0.100000 \n",
"\n",
" spectral_centroid_mean spectral_centroid_std spectral_centroid_max \\\n",
"count 3.000000 3.000000 3.000000 \n",
"mean 1167.873517 1370.185515 5290.721691 \n",
"std 796.700603 532.092792 1190.637435 \n",
"min 669.513790 916.785300 4111.711753 \n",
"25% 708.447791 1077.291648 4689.754748 \n",
"50% 747.381791 1237.797995 5267.797743 \n",
"75% 1417.053381 1596.885622 5880.226659 \n",
"max 2086.724971 1955.973249 6492.655576 \n",
"\n",
" spectral_centroid_min spectral_flatness_mean ... mfcc_9_mean \\\n",
"count 3.0 3.000000 ... 3.000000 \n",
"mean 0.0 0.450286 ... -4.213793 \n",
"std 0.0 0.185900 ... 4.303751 \n",
"min 0.0 0.259868 ... -9.014230 \n",
"25% 0.0 0.359773 ... -5.970484 \n",
"50% 0.0 0.459678 ... -2.926738 \n",
"75% 0.0 0.545495 ... -1.813576 \n",
"max 0.0 0.631312 ... -0.700413 \n",
"\n",
" mfcc_9_std mfcc_10_mean mfcc_10_std mfcc_11_mean mfcc_11_std \\\n",
"count 3.000000 3.000000 3.000000 3.000000 3.000000 \n",
"mean 13.108172 -3.842924 10.529000 -0.789713 8.556182 \n",
"std 3.600799 4.184387 4.775241 1.129238 2.425202 \n",
"min 10.004570 -8.432466 6.110453 -1.799932 6.724971 \n",
"25% 11.134202 -5.644368 7.996054 -1.399260 7.180964 \n",
"50% 12.263833 -2.856271 9.881656 -0.998587 7.636956 \n",
"75% 14.659974 -1.548153 12.738273 -0.284604 9.471787 \n",
"max 17.056114 -0.240036 15.594891 0.429380 11.306618 \n",
"\n",
" mfcc_12_mean mfcc_12_std mfcc_13_mean mfcc_13_std \n",
"count 3.000000 3.000000 3.000000 3.000000 \n",
"mean -1.483167 8.306762 0.140097 8.139359 \n",
"std 1.467043 1.164628 0.874282 0.830208 \n",
"min -3.176131 7.082282 -0.647611 7.199394 \n",
"25% -1.931997 7.759891 -0.330237 7.822828 \n",
"50% -0.687864 8.437500 -0.012864 8.446262 \n",
"75% -0.636685 8.919003 0.533951 8.609341 \n",
"max -0.585506 9.400505 1.080766 8.772419 \n",
"\n",
"[8 rows x 47 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Run extract_audio_features on one sample recording per dataset.\n",
"ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
"casia_file = './CAISA/liuchanhg/angry/201.wav'\n",
"savee_file = './SAVEE/AudioData/DC/a01.wav'\n",
"\n",
"# Only files that exist on disk are processed.\n",
"audio_files = [\n",
"    path for path in (ravdess_file, casia_file, savee_file)\n",
"    if os.path.exists(path)\n",
"]\n",
"\n",
"# One single-row DataFrame per file; they are concatenated once after the\n",
"# loop instead of growing a DataFrame inside it.\n",
"feature_frames = []\n",
"\n",
"for file_path in audio_files:\n",
"    print(f\"处理文件: {file_path}\")\n",
"\n",
"    file_name = os.path.basename(file_path)\n",
"    features_df, audio, sr = extract_audio_features(file_path)\n",
"\n",
"    # Infer the dataset and emotion labels from the path.\n",
"    emotion = \"unknown\"\n",
"    dataset = \"unknown\"\n",
"\n",
"    if \"RAVDESS\" in file_path:\n",
"        dataset = \"RAVDESS\"\n",
"        # The third dash-separated field of the file name is the emotion code.\n",
"        parts = file_name.split('-')\n",
"        if len(parts) >= 3:\n",
"            emotion_map = {'01': 'neutral', '03': 'happy', '04': 'sad',\n",
"                           '05': 'angry', '06': 'fear', '08': 'surprise'}\n",
"            emotion = emotion_map.get(parts[2], emotion)\n",
"    elif \"CAISA\" in file_path:\n",
"        dataset = \"CASIA\"\n",
"        # The emotion is the name of the directory that holds the file. Use\n",
"        # os.path helpers rather than splitting on os.sep: the paths are\n",
"        # written with '/', so on Windows (where os.sep is a backslash) the\n",
"        # split never found the directory and the label stayed 'unknown'.\n",
"        parent = os.path.basename(os.path.dirname(os.path.normpath(file_path)))\n",
"        if parent:\n",
"            emotion = parent\n",
"    elif \"SAVEE\" in file_path:\n",
"        dataset = \"SAVEE\"\n",
"        # The emotion is encoded as a prefix of the file name.\n",
"        for prefix, label in (('sa', 'sad'), ('su', 'surprise'), ('a', 'angry'),\n",
"                              ('f', 'fear'), ('h', 'happy'), ('n', 'neutral')):\n",
"            if file_name.startswith(prefix):\n",
"                emotion = label\n",
"                break\n",
"\n",
"    features_df['file_name'] = file_name\n",
"    features_df['dataset'] = dataset\n",
"    features_df['emotion'] = emotion\n",
"\n",
"    feature_frames.append(features_df)\n",
"\n",
"# Fall back to an empty DataFrame so the emptiness check below still works.\n",
"all_features_df = (\n",
"    pd.concat(feature_frames, ignore_index=True) if feature_frames else pd.DataFrame()\n",
")\n",
"\n",
"if not all_features_df.empty:\n",
"    # Put the file information columns in front of the feature columns.\n",
"    info_cols = ['file_name', 'dataset', 'emotion']\n",
"    feature_cols = [col for col in all_features_df.columns if col not in info_cols]\n",
"    all_features_df = all_features_df[info_cols + feature_cols]\n",
"\n",
"    print(\"\\n音频特征DataFrame:\")\n",
"    display(all_features_df)\n",
"\n",
"    print(\"\\n特征统计信息:\")\n",
"    display(all_features_df.describe())\n",
"else:\n",
"    print(\"未找到任何音频文件或处理过程中出错\")\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"音高帧级数据 (前10行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" pitch | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 2458.481445 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 182.268219 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 190.799026 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 197.138947 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 423.822876 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
" 0.11610 | \n",
" 166.867813 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
" 0.13932 | \n",
" 172.440811 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
" 0.16254 | \n",
" 168.360764 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
" 0.18576 | \n",
" 148.656494 | \n",
"
\n",
" \n",
" | 9 | \n",
" 9 | \n",
" 0.20898 | \n",
" 151.040848 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time pitch\n",
"0 0 0.00000 2458.481445\n",
"1 1 0.02322 182.268219\n",
"2 2 0.04644 190.799026\n",
"3 3 0.06966 197.138947\n",
"4 4 0.09288 423.822876\n",
"5 5 0.11610 166.867813\n",
"6 6 0.13932 172.440811\n",
"7 7 0.16254 168.360764\n",
"8 8 0.18576 148.656494\n",
"9 9 0.20898 151.040848"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高和调谐统计特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" pitch_range | \n",
" pitch_median | \n",
" tuning_offset | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3988.073975 | \n",
" 147.432693 | \n",
" 3840.641357 | \n",
" 234.112061 | \n",
" -0.17 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_mean pitch_std pitch_max pitch_min pitch_range pitch_median \\\n",
"0 573.830444 853.130066 3988.073975 147.432693 3840.641357 234.112061 \n",
"\n",
" tuning_offset \n",
"0 -0.17 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高四分位数统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_q1 | \n",
" pitch_median | \n",
" pitch_q3 | \n",
" pitch_iqr | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 173.241608 | \n",
" 234.112061 | \n",
" 488.627747 | \n",
" 315.386139 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_q1 pitch_median pitch_q3 pitch_iqr\n",
"0 173.241608 234.112061 488.627747 315.386139"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试音高和调谐偏差特征提取,以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 创建一个存储音高和调谐特征的DataFrame\n",
" pitch_df = pd.DataFrame()\n",
" \n",
" # 提取音高特征\n",
" pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)\n",
" \n",
" # 提取每帧最大幅度对应的音高\n",
" pitch_values = []\n",
" times_list = []\n",
" frame_indices = []\n",
" \n",
" for t in range(pitches.shape[1]):\n",
" idx = np.argmax(magnitudes[:, t])\n",
" pitch = pitches[idx, t]\n",
" time = t * 512 / sr # 计算时间点\n",
" \n",
" if pitch > 0: # 过滤掉静音帧\n",
" pitch_values.append(pitch)\n",
" times_list.append(time)\n",
" frame_indices.append(t)\n",
" \n",
" # 创建音高帧级DataFrame\n",
" pitch_frames_df = pd.DataFrame({\n",
" 'frame': frame_indices,\n",
" 'time': times_list,\n",
" 'pitch': pitch_values\n",
" })\n",
" \n",
" # 计算音高统计特征\n",
" pitch_stats = {\n",
" 'pitch_mean': np.mean(pitch_values) if pitch_values else 0,\n",
" 'pitch_std': np.std(pitch_values) if len(pitch_values) > 1 else 0,\n",
" 'pitch_max': np.max(pitch_values) if pitch_values else 0,\n",
" 'pitch_min': np.min(pitch_values) if pitch_values else 0,\n",
" 'pitch_range': np.ptp(pitch_values) if pitch_values else 0, # 峰峰值\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0\n",
" }\n",
" \n",
" # 提取调谐偏差\n",
" tuning_offset = librosa.estimate_tuning(y=audio, sr=sr)\n",
" pitch_stats['tuning_offset'] = tuning_offset\n",
" \n",
" # 创建统计特征DataFrame\n",
" pitch_stats_df = pd.DataFrame([pitch_stats])\n",
" \n",
" # 展示结果\n",
" print(\"\\n音高帧级数据 (前10行):\")\n",
" display(pitch_frames_df.head(10))\n",
" \n",
" print(\"\\n音高和调谐统计特征:\")\n",
" display(pitch_stats_df)\n",
" \n",
" # 计算音高四分位数\n",
" q1 = np.percentile(pitch_values, 25) if pitch_values else 0\n",
" q3 = np.percentile(pitch_values, 75) if pitch_values else 0\n",
" iqr = q3 - q1\n",
" \n",
" quartile_stats = {\n",
" 'pitch_q1': q1,\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0,\n",
" 'pitch_q3': q3,\n",
" 'pitch_iqr': iqr\n",
" }\n",
" \n",
" print(\"\\n音高四分位数统计:\")\n",
" display(pd.DataFrame([quartile_stats]))\n",
" \n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
" \n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.2 构建频谱质心、光谱平坦度指标的代码\n",
"\n",
"频谱质心是频谱的质心,表示声音的\"亮度\"。频谱平坦度是描述频谱分布平坦程度的指标,可以区分噪声和音调声音。这些特征对于情感识别也非常重要。\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"频谱质心和频谱平坦度帧级数据 (前10行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" spectral_centroid | \n",
" spectral_flatness | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 4785.214012 | \n",
" 0.602522 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 4783.377794 | \n",
" 0.729877 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 4619.471629 | \n",
" 0.619756 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 4579.571027 | \n",
" 0.507635 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 4589.515428 | \n",
" 0.441491 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
" 0.11610 | \n",
" 4549.248625 | \n",
" 0.446107 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
" 0.13932 | \n",
" 4404.914089 | \n",
" 0.428921 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
" 0.16254 | \n",
" 4186.607639 | \n",
" 0.286726 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
" 0.18576 | \n",
" 4229.668787 | \n",
" 0.306454 | \n",
"
\n",
" \n",
" | 9 | \n",
" 9 | \n",
" 0.20898 | \n",
" 4518.854683 | \n",
" 0.434580 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time spectral_centroid spectral_flatness\n",
"0 0 0.00000 4785.214012 0.602522\n",
"1 1 0.02322 4783.377794 0.729877\n",
"2 2 0.04644 4619.471629 0.619756\n",
"3 3 0.06966 4579.571027 0.507635\n",
"4 4 0.09288 4589.515428 0.441491\n",
"5 5 0.11610 4549.248625 0.446107\n",
"6 6 0.13932 4404.914089 0.428921\n",
"7 7 0.16254 4186.607639 0.286726\n",
"8 8 0.18576 4229.668787 0.306454\n",
"9 9 0.20898 4518.854683 0.434580"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"频谱质心和频谱平坦度统计特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" spectral_centroid_mean | \n",
" spectral_centroid_std | \n",
" spectral_centroid_max | \n",
" spectral_centroid_min | \n",
" spectral_centroid_median | \n",
" spectral_centroid_q1 | \n",
" spectral_centroid_q3 | \n",
" spectral_flatness_mean | \n",
" spectral_flatness_std | \n",
" spectral_flatness_max | \n",
" spectral_flatness_min | \n",
" spectral_flatness_median | \n",
" spectral_flatness_q1 | \n",
" spectral_flatness_q3 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2086.724971 | \n",
" 1955.973249 | \n",
" 6492.655576 | \n",
" 0.0 | \n",
" 1411.301154 | \n",
" 0.0 | \n",
" 4177.900461 | \n",
" 0.459678 | \n",
" 0.425451 | \n",
" 1.000001 | \n",
" 0.000078 | \n",
" 0.358098 | \n",
" 0.022675 | \n",
" 1.000001 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" spectral_centroid_mean spectral_centroid_std spectral_centroid_max \\\n",
"0 2086.724971 1955.973249 6492.655576 \n",
"\n",
" spectral_centroid_min spectral_centroid_median spectral_centroid_q1 \\\n",
"0 0.0 1411.301154 0.0 \n",
"\n",
" spectral_centroid_q3 spectral_flatness_mean spectral_flatness_std \\\n",
"0 4177.900461 0.459678 0.425451 \n",
"\n",
" spectral_flatness_max spectral_flatness_min spectral_flatness_median \\\n",
"0 1.000001 0.000078 0.358098 \n",
"\n",
" spectral_flatness_q1 spectral_flatness_q3 \n",
"0 0.022675 1.000001 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"频谱质心和频谱平坦度相关性: -0.4960\n",
"\n",
"百分位数统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" centroid_p10 | \n",
" flatness_p10 | \n",
" centroid_p25 | \n",
" flatness_p25 | \n",
" centroid_p50 | \n",
" flatness_p50 | \n",
" centroid_p75 | \n",
" flatness_p75 | \n",
" centroid_p90 | \n",
" flatness_p90 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.0 | \n",
" 0.000577 | \n",
" 0.0 | \n",
" 0.022675 | \n",
" 1411.301154 | \n",
" 0.358098 | \n",
" 4177.900461 | \n",
" 1.000001 | \n",
" 4703.581505 | \n",
" 1.000001 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" centroid_p10 flatness_p10 centroid_p25 flatness_p25 centroid_p50 \\\n",
"0 0.0 0.000577 0.0 0.022675 1411.301154 \n",
"\n",
" flatness_p50 centroid_p75 flatness_p75 centroid_p90 flatness_p90 \n",
"0 0.358098 4177.900461 1.000001 4703.581505 1.000001 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试频谱质心和频谱平坦度特征提取,并以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 提取频谱质心\n",
" spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]\n",
" \n",
" # 提取频谱平坦度\n",
" spectral_flatness = librosa.feature.spectral_flatness(y=audio)[0]\n",
" \n",
" # 获取时间点\n",
" times = librosa.times_like(spectral_centroid)\n",
" \n",
" # 创建帧级特征DataFrame\n",
" frames_data = []\n",
" for i, t in enumerate(times):\n",
" if i < len(spectral_centroid) and i < len(spectral_flatness):\n",
" frames_data.append({\n",
" 'frame': i,\n",
" 'time': t,\n",
" 'spectral_centroid': spectral_centroid[i],\n",
" 'spectral_flatness': spectral_flatness[i]\n",
" })\n",
" \n",
" # 创建DataFrame\n",
" frames_df = pd.DataFrame(frames_data)\n",
" \n",
" # 计算统计特征\n",
" centroid_stats = {\n",
" 'spectral_centroid_mean': np.mean(spectral_centroid),\n",
" 'spectral_centroid_std': np.std(spectral_centroid),\n",
" 'spectral_centroid_max': np.max(spectral_centroid),\n",
" 'spectral_centroid_min': np.min(spectral_centroid),\n",
" 'spectral_centroid_median': np.median(spectral_centroid),\n",
" 'spectral_centroid_q1': np.percentile(spectral_centroid, 25),\n",
" 'spectral_centroid_q3': np.percentile(spectral_centroid, 75)\n",
" }\n",
" \n",
" flatness_stats = {\n",
" 'spectral_flatness_mean': np.mean(spectral_flatness),\n",
" 'spectral_flatness_std': np.std(spectral_flatness),\n",
" 'spectral_flatness_max': np.max(spectral_flatness),\n",
" 'spectral_flatness_min': np.min(spectral_flatness),\n",
" 'spectral_flatness_median': np.median(spectral_flatness),\n",
" 'spectral_flatness_q1': np.percentile(spectral_flatness, 25),\n",
" 'spectral_flatness_q3': np.percentile(spectral_flatness, 75)\n",
" }\n",
" \n",
" # 合并统计特征\n",
" stats = {**centroid_stats, **flatness_stats}\n",
" stats_df = pd.DataFrame([stats])\n",
" \n",
" # 显示帧级特征(前10行)\n",
" print(\"\\n频谱质心和频谱平坦度帧级数据 (前10行):\")\n",
" display(frames_df.head(10))\n",
" \n",
" # 显示统计特征\n",
" print(\"\\n频谱质心和频谱平坦度统计特征:\")\n",
" display(stats_df)\n",
" \n",
" # 计算相关性\n",
" correlation = np.corrcoef(spectral_centroid, spectral_flatness)[0, 1]\n",
" print(f\"\\n频谱质心和频谱平坦度相关性: {correlation:.4f}\")\n",
" \n",
" # 创建百分位数统计\n",
" percentiles = [10, 25, 50, 75, 90]\n",
" percentile_stats = {}\n",
" \n",
" for p in percentiles:\n",
" percentile_stats[f'centroid_p{p}'] = np.percentile(spectral_centroid, p)\n",
" percentile_stats[f'flatness_p{p}'] = np.percentile(spectral_flatness, p)\n",
" \n",
" print(\"\\n百分位数统计:\")\n",
" display(pd.DataFrame([percentile_stats]))\n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.3 构建梅尔频率、光谱对比度指标的代码\n",
"\n",
"梅尔频率倒谱系数(MFCC)是音频处理中最常用的特征,模拟人耳对声音的感知。光谱对比度衡量谱带之间的差异,能够捕捉音调和节奏的变化。\n"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"MFCC帧级数据 (前5行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" mfcc_1 | \n",
" mfcc_2 | \n",
" mfcc_3 | \n",
" mfcc_4 | \n",
" mfcc_5 | \n",
" mfcc_6 | \n",
" mfcc_7 | \n",
" mfcc_8 | \n",
" mfcc_9 | \n",
" mfcc_10 | \n",
" mfcc_11 | \n",
" mfcc_12 | \n",
" mfcc_13 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" -857.305847 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" -857.305847 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" -857.305847 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" -857.305847 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" -857.305847 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time mfcc_1 mfcc_2 mfcc_3 mfcc_4 mfcc_5 mfcc_6 mfcc_7 \\\n",
"0 0 0.00000 -857.305847 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 1 0.02322 -857.305847 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 2 0.04644 -857.305847 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 3 0.06966 -857.305847 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 4 0.09288 -857.305847 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" mfcc_8 mfcc_9 mfcc_10 mfcc_11 mfcc_12 mfcc_13 \n",
"0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 0.0 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"光谱对比度帧级数据 (前5行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" contrast_band_1 | \n",
" contrast_band_2 | \n",
" contrast_band_3 | \n",
" contrast_band_4 | \n",
" contrast_band_5 | \n",
" contrast_band_6 | \n",
" contrast_band_7 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 0.261872 | \n",
" 0.925997 | \n",
" 4.476748 | \n",
" 2.103019 | \n",
" 4.643754 | \n",
" 9.242361 | \n",
" 41.680347 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 2.949578 | \n",
" 3.694045 | \n",
" 7.620502 | \n",
" 15.030630 | \n",
" 11.917979 | \n",
" 14.289726 | \n",
" 40.687895 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 2.732142 | \n",
" 5.549647 | \n",
" 9.467184 | \n",
" 7.868653 | \n",
" 15.430449 | \n",
" 10.744310 | \n",
" 38.656327 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 8.873114 | \n",
" 6.092097 | \n",
" 13.238993 | \n",
" 9.561276 | \n",
" 13.876824 | \n",
" 12.928647 | \n",
" 38.942839 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 13.186358 | \n",
" 7.774552 | \n",
" 9.024843 | \n",
" 14.182148 | \n",
" 13.927470 | \n",
" 12.691566 | \n",
" 40.009761 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time contrast_band_1 contrast_band_2 contrast_band_3 \\\n",
"0 0 0.00000 0.261872 0.925997 4.476748 \n",
"1 1 0.02322 2.949578 3.694045 7.620502 \n",
"2 2 0.04644 2.732142 5.549647 9.467184 \n",
"3 3 0.06966 8.873114 6.092097 13.238993 \n",
"4 4 0.09288 13.186358 7.774552 9.024843 \n",
"\n",
" contrast_band_4 contrast_band_5 contrast_band_6 contrast_band_7 \n",
"0 2.103019 4.643754 9.242361 41.680347 \n",
"1 15.030630 11.917979 14.289726 40.687895 \n",
"2 7.868653 15.430449 10.744310 38.656327 \n",
"3 9.561276 13.876824 12.928647 38.942839 \n",
"4 14.182148 13.927470 12.691566 40.009761 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"MFCC和光谱对比度统计特征 (前10个):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mfcc_1_mean | \n",
" mfcc_1_std | \n",
" mfcc_1_max | \n",
" mfcc_1_min | \n",
" mfcc_2_mean | \n",
" mfcc_2_std | \n",
" mfcc_2_max | \n",
" mfcc_2_min | \n",
" mfcc_3_mean | \n",
" mfcc_3_std | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" -753.195312 | \n",
" 164.561844 | \n",
" -394.404266 | \n",
" -857.305847 | \n",
" 38.492062 | \n",
" 66.268105 | \n",
" 203.227997 | \n",
" -94.004196 | \n",
" -1.693892 | \n",
" 16.70042 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mfcc_1_mean mfcc_1_std mfcc_1_max mfcc_1_min mfcc_2_mean mfcc_2_std \\\n",
"0 -753.195312 164.561844 -394.404266 -857.305847 38.492062 66.268105 \n",
"\n",
" mfcc_2_max mfcc_2_min mfcc_3_mean mfcc_3_std \n",
"0 203.227997 -94.004196 -1.693892 16.70042 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"总共提取了 134 个特征\n",
"\n",
"特征数量统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MFCC原始特征 | \n",
" MFCC一阶差分 | \n",
" MFCC二阶差分 | \n",
" 光谱对比度特征 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 特征数 | \n",
" 52 | \n",
" 26 | \n",
" 26 | \n",
" 30 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" MFCC原始特征 MFCC一阶差分 MFCC二阶差分 光谱对比度特征\n",
"特征数 52 26 26 30"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试MFCC和光谱对比度特征提取,以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 提取MFCC特征\n",
" n_mfcc = 13 # MFCC系数数量\n",
" mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)\n",
" \n",
" # 提取光谱对比度特征\n",
" spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)\n",
" \n",
" # 创建MFCC帧级特征DataFrame\n",
" mfcc_frames = []\n",
" times = librosa.times_like(mfccs[0])\n",
" \n",
" for t in range(len(times)):\n",
" frame_data = {'frame': t, 'time': times[t]}\n",
" \n",
" # 添加每个MFCC系数\n",
" for i in range(n_mfcc):\n",
" if t < len(mfccs[i]):\n",
" frame_data[f'mfcc_{i+1}'] = mfccs[i][t]\n",
" \n",
" mfcc_frames.append(frame_data)\n",
" \n",
" mfcc_frames_df = pd.DataFrame(mfcc_frames)\n",
" \n",
" # 创建MFCC统计特征\n",
" mfcc_stats = {}\n",
" for i in range(n_mfcc):\n",
" mfcc_stats[f'mfcc_{i+1}_mean'] = np.mean(mfccs[i])\n",
" mfcc_stats[f'mfcc_{i+1}_std'] = np.std(mfccs[i])\n",
" mfcc_stats[f'mfcc_{i+1}_max'] = np.max(mfccs[i])\n",
" mfcc_stats[f'mfcc_{i+1}_min'] = np.min(mfccs[i])\n",
" \n",
" # 计算MFCC的一阶差分\n",
" mfcc_delta = librosa.feature.delta(mfccs)\n",
" for i in range(n_mfcc):\n",
" mfcc_stats[f'mfcc_{i+1}_delta_mean'] = np.mean(mfcc_delta[i])\n",
" mfcc_stats[f'mfcc_{i+1}_delta_std'] = np.std(mfcc_delta[i])\n",
" \n",
" # 计算MFCC的二阶差分\n",
" mfcc_delta2 = librosa.feature.delta(mfccs, order=2)\n",
" for i in range(n_mfcc):\n",
" mfcc_stats[f'mfcc_{i+1}_delta2_mean'] = np.mean(mfcc_delta2[i])\n",
" mfcc_stats[f'mfcc_{i+1}_delta2_std'] = np.std(mfcc_delta2[i])\n",
" \n",
" # 创建光谱对比度帧级特征DataFrame\n",
" contrast_frames = []\n",
" contrast_times = librosa.times_like(spectral_contrast[0])\n",
" \n",
" for t in range(len(contrast_times)):\n",
" frame_data = {'frame': t, 'time': contrast_times[t]}\n",
" \n",
" # 添加每个频带的对比度\n",
" for i in range(spectral_contrast.shape[0]):\n",
" if t < len(spectral_contrast[i]):\n",
" frame_data[f'contrast_band_{i+1}'] = spectral_contrast[i][t]\n",
" \n",
" contrast_frames.append(frame_data)\n",
" \n",
" contrast_frames_df = pd.DataFrame(contrast_frames)\n",
" \n",
" # 创建光谱对比度统计特征\n",
" contrast_stats = {}\n",
" for i in range(spectral_contrast.shape[0]):\n",
" contrast_stats[f'contrast_band_{i+1}_mean'] = np.mean(spectral_contrast[i])\n",
" contrast_stats[f'contrast_band_{i+1}_std'] = np.std(spectral_contrast[i])\n",
" contrast_stats[f'contrast_band_{i+1}_max'] = np.max(spectral_contrast[i])\n",
" contrast_stats[f'contrast_band_{i+1}_min'] = np.min(spectral_contrast[i])\n",
" \n",
" # 光谱对比度的总体统计\n",
" contrast_stats['spectral_contrast_mean'] = np.mean(spectral_contrast)\n",
" contrast_stats['spectral_contrast_std'] = np.std(spectral_contrast)\n",
" \n",
" # 合并MFCC和光谱对比度统计特征\n",
" all_stats = {**mfcc_stats, **contrast_stats}\n",
" all_stats_df = pd.DataFrame([all_stats])\n",
" \n",
" # 显示结果\n",
" print(\"\\nMFCC帧级数据 (前5行):\")\n",
" display(mfcc_frames_df.head(5))\n",
" \n",
" print(\"\\n光谱对比度帧级数据 (前5行):\")\n",
" display(contrast_frames_df.head(5))\n",
" \n",
" print(\"\\nMFCC和光谱对比度统计特征 (前10个):\")\n",
" cols = list(all_stats_df.columns)[:10]\n",
" display(all_stats_df[cols])\n",
" \n",
" print(f\"\\n总共提取了 {len(all_stats)} 个特征\")\n",
" \n",
" # 按特征类型分组统计\n",
" feature_counts = {\n",
" 'MFCC原始特征': n_mfcc * 4,\n",
" 'MFCC一阶差分': n_mfcc * 2,\n",
" 'MFCC二阶差分': n_mfcc * 2,\n",
" '光谱对比度特征': spectral_contrast.shape[0] * 4 + 2\n",
" }\n",
" \n",
" print(\"\\n特征数量统计:\")\n",
" display(pd.DataFrame([feature_counts], index=['特征数']))\n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.4 构建均方根能量、色谱图指标的代码\n",
"\n",
"均方根能量(RMS)反映了音频信号的能量变化,可以指示情感的强度。色谱图特征可以捕捉音高类特征,对分析情感变化很有帮助。\n"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"RMS和ZCR帧级数据 (前5行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" rms | \n",
" zcr | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 6.232337e-07 | \n",
" 0.041504 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 9.135177e-07 | \n",
" 0.069824 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 1.276243e-06 | \n",
" 0.131348 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 1.604472e-06 | \n",
" 0.173828 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 1.715774e-06 | \n",
" 0.193848 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time rms zcr\n",
"0 0 0.00000 6.232337e-07 0.041504\n",
"1 1 0.02322 9.135177e-07 0.069824\n",
"2 2 0.04644 1.276243e-06 0.131348\n",
"3 3 0.06966 1.604472e-06 0.173828\n",
"4 4 0.09288 1.715774e-06 0.193848"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"色谱图帧级数据 (前5行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" chroma_1 | \n",
" chroma_2 | \n",
" chroma_3 | \n",
" chroma_4 | \n",
" chroma_5 | \n",
" chroma_6 | \n",
" chroma_7 | \n",
" chroma_8 | \n",
" chroma_9 | \n",
" chroma_10 | \n",
" chroma_11 | \n",
" chroma_12 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 0.534806 | \n",
" 0.779990 | \n",
" 1.000000 | \n",
" 0.884072 | \n",
" 0.547614 | \n",
" 0.277783 | \n",
" 0.223865 | \n",
" 0.226283 | \n",
" 0.266252 | \n",
" 0.372628 | \n",
" 0.348106 | \n",
" 0.348874 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 0.620370 | \n",
" 0.831862 | \n",
" 1.000000 | \n",
" 0.892350 | \n",
" 0.599051 | \n",
" 0.378208 | \n",
" 0.319750 | \n",
" 0.332158 | \n",
" 0.362094 | \n",
" 0.462163 | \n",
" 0.450810 | \n",
" 0.447287 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 0.829280 | \n",
" 0.769134 | \n",
" 0.754131 | \n",
" 0.629990 | \n",
" 0.650650 | \n",
" 0.756760 | \n",
" 0.818477 | \n",
" 0.791697 | \n",
" 0.664502 | \n",
" 0.579084 | \n",
" 0.769260 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 0.998285 | \n",
" 0.815828 | \n",
" 0.739085 | \n",
" 0.851994 | \n",
" 0.784696 | \n",
" 0.790131 | \n",
" 1.000000 | \n",
" 0.952804 | \n",
" 0.767395 | \n",
" 0.758635 | \n",
" 0.780154 | \n",
" 0.926421 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 1.000000 | \n",
" 0.770530 | \n",
" 0.642109 | \n",
" 0.863174 | \n",
" 0.896374 | \n",
" 0.837818 | \n",
" 0.923541 | \n",
" 0.894363 | \n",
" 0.866183 | \n",
" 0.789348 | \n",
" 0.698850 | \n",
" 0.754964 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time chroma_1 chroma_2 chroma_3 chroma_4 chroma_5 chroma_6 \\\n",
"0 0 0.00000 0.534806 0.779990 1.000000 0.884072 0.547614 0.277783 \n",
"1 1 0.02322 0.620370 0.831862 1.000000 0.892350 0.599051 0.378208 \n",
"2 2 0.04644 0.829280 0.769134 0.754131 0.629990 0.650650 0.756760 \n",
"3 3 0.06966 0.998285 0.815828 0.739085 0.851994 0.784696 0.790131 \n",
"4 4 0.09288 1.000000 0.770530 0.642109 0.863174 0.896374 0.837818 \n",
"\n",
" chroma_7 chroma_8 chroma_9 chroma_10 chroma_11 chroma_12 \n",
"0 0.223865 0.226283 0.266252 0.372628 0.348106 0.348874 \n",
"1 0.319750 0.332158 0.362094 0.462163 0.450810 0.447287 \n",
"2 0.818477 0.791697 0.664502 0.579084 0.769260 1.000000 \n",
"3 1.000000 0.952804 0.767395 0.758635 0.780154 0.926421 \n",
"4 0.923541 0.894363 0.866183 0.789348 0.698850 0.754964 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RMS和ZCR统计特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rms_mean | \n",
" rms_std | \n",
" rms_max | \n",
" rms_min | \n",
" rms_median | \n",
" rms_q1 | \n",
" rms_q3 | \n",
" zero_crossing_rate_mean | \n",
" zero_crossing_rate_std | \n",
" zero_crossing_rate_max | \n",
" zero_crossing_rate_min | \n",
" zero_crossing_rate_median | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.001493 | \n",
" 0.002882 | \n",
" 0.012591 | \n",
" 0.0 | \n",
" 0.000003 | \n",
" 0.0 | \n",
" 0.000946 | \n",
" 0.149477 | \n",
" 0.170416 | \n",
" 0.52832 | \n",
" 0.0 | \n",
" 0.069336 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rms_mean rms_std rms_max rms_min rms_median rms_q1 rms_q3 \\\n",
"0 0.001493 0.002882 0.012591 0.0 0.000003 0.0 0.000946 \n",
"\n",
" zero_crossing_rate_mean zero_crossing_rate_std zero_crossing_rate_max \\\n",
"0 0.149477 0.170416 0.52832 \n",
"\n",
" zero_crossing_rate_min zero_crossing_rate_median \n",
"0 0.0 0.069336 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"色谱图统计特征 (前10个):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" chroma_1_mean | \n",
" chroma_1_std | \n",
" chroma_1_max | \n",
" chroma_1_min | \n",
" chroma_2_mean | \n",
" chroma_2_std | \n",
" chroma_2_max | \n",
" chroma_2_min | \n",
" chroma_3_mean | \n",
" chroma_3_std | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.44199 | \n",
" 0.396882 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.397397 | \n",
" 0.365359 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.340485 | \n",
" 0.340303 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" chroma_1_mean chroma_1_std chroma_1_max chroma_1_min chroma_2_mean \\\n",
"0 0.44199 0.396882 1.0 0.0 0.397397 \n",
"\n",
" chroma_2_std chroma_2_max chroma_2_min chroma_3_mean chroma_3_std \n",
"0 0.365359 1.0 0.0 0.340485 0.340303 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"梅尔频谱相关特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" mel_spec_mean | \n",
" mel_spec_std | \n",
" log_power_mean | \n",
" log_power_std | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0.00239 | \n",
" 0.033072 | \n",
" -69.638008 | \n",
" 7.839996 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" mel_spec_mean mel_spec_std log_power_mean log_power_std\n",
"0 0.00239 0.033072 -69.638008 7.839996"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"总共提取了 64 个特征\n",
"\n",
"特征数量统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" RMS能量特征 | \n",
" 零交叉率特征 | \n",
" 色谱图特征 | \n",
" 梅尔频谱相关特征 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 特征数 | \n",
" 7 | \n",
" 5 | \n",
" 48 | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" RMS能量特征 零交叉率特征 色谱图特征 梅尔频谱相关特征\n",
"特征数 7 5 48 4"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试RMS、ZCR和色谱图特征提取,以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 提取RMS\n",
" rms = librosa.feature.rms(y=audio)[0]\n",
" \n",
" # 提取零交叉率\n",
" zcr = librosa.feature.zero_crossing_rate(audio)[0]\n",
" \n",
" # 提取色谱图\n",
" chroma = librosa.feature.chroma_stft(y=audio, sr=sr)\n",
" \n",
" # 提取梅尔频谱\n",
" mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)\n",
" log_power = librosa.amplitude_to_db(mel_spec)\n",
" \n",
" # 创建RMS和ZCR帧级特征DataFrame\n",
" energy_frames = []\n",
" times = librosa.times_like(rms)\n",
" \n",
" for t in range(len(times)):\n",
" if t < len(rms) and t < len(zcr):\n",
" frame_data = {\n",
" 'frame': t,\n",
" 'time': times[t],\n",
" 'rms': rms[t],\n",
" 'zcr': zcr[t]\n",
" }\n",
" energy_frames.append(frame_data)\n",
" \n",
" energy_frames_df = pd.DataFrame(energy_frames)\n",
" \n",
" # 创建色谱图帧级特征DataFrame\n",
" chroma_frames = []\n",
" chroma_times = librosa.times_like(chroma[0])\n",
" \n",
" for t in range(len(chroma_times)):\n",
" frame_data = {'frame': t, 'time': chroma_times[t]}\n",
" \n",
" # 添加每个色度的值\n",
" for i in range(chroma.shape[0]):\n",
" if t < len(chroma[i]):\n",
" frame_data[f'chroma_{i+1}'] = chroma[i][t]\n",
" \n",
" chroma_frames.append(frame_data)\n",
" \n",
" chroma_frames_df = pd.DataFrame(chroma_frames)\n",
" \n",
" # 计算RMS和ZCR统计特征\n",
" energy_stats = {\n",
" 'rms_mean': np.mean(rms),\n",
" 'rms_std': np.std(rms),\n",
" 'rms_max': np.max(rms),\n",
" 'rms_min': np.min(rms),\n",
" 'rms_median': np.median(rms),\n",
" 'rms_q1': np.percentile(rms, 25),\n",
" 'rms_q3': np.percentile(rms, 75),\n",
" 'zero_crossing_rate_mean': np.mean(zcr),\n",
" 'zero_crossing_rate_std': np.std(zcr),\n",
" 'zero_crossing_rate_max': np.max(zcr),\n",
" 'zero_crossing_rate_min': np.min(zcr),\n",
" 'zero_crossing_rate_median': np.median(zcr)\n",
" }\n",
" \n",
" # 计算色谱图统计特征\n",
" chroma_stats = {}\n",
" for i in range(chroma.shape[0]):\n",
" chroma_stats[f'chroma_{i+1}_mean'] = np.mean(chroma[i])\n",
" chroma_stats[f'chroma_{i+1}_std'] = np.std(chroma[i])\n",
" chroma_stats[f'chroma_{i+1}_max'] = np.max(chroma[i])\n",
" chroma_stats[f'chroma_{i+1}_min'] = np.min(chroma[i])\n",
" \n",
" # 添加梅尔频谱相关特征\n",
" mel_stats = {\n",
" 'mel_spec_mean': np.mean(mel_spec),\n",
" 'mel_spec_std': np.std(mel_spec),\n",
" 'log_power_mean': np.mean(log_power),\n",
" 'log_power_std': np.std(log_power)\n",
" }\n",
" \n",
" # 合并所有统计特征\n",
" all_stats = {**energy_stats, **chroma_stats, **mel_stats}\n",
" all_stats_df = pd.DataFrame([all_stats])\n",
" \n",
" # 显示结果\n",
" print(\"\\nRMS和ZCR帧级数据 (前5行):\")\n",
" display(energy_frames_df.head(5))\n",
" \n",
" print(\"\\n色谱图帧级数据 (前5行):\")\n",
" display(chroma_frames_df.head(5))\n",
" \n",
" print(\"\\nRMS和ZCR统计特征:\")\n",
" energy_cols = [col for col in all_stats_df.columns if col.startswith('rms') or col.startswith('zero')]\n",
" display(all_stats_df[energy_cols])\n",
" \n",
" print(\"\\n色谱图统计特征 (前10个):\")\n",
" chroma_cols = [col for col in all_stats_df.columns if col.startswith('chroma')][:10]\n",
" display(all_stats_df[chroma_cols])\n",
" \n",
" print(\"\\n梅尔频谱相关特征:\")\n",
" mel_cols = [col for col in all_stats_df.columns if col.startswith('mel') or col.startswith('log')]\n",
" display(all_stats_df[mel_cols])\n",
" \n",
" print(f\"\\n总共提取了 {len(all_stats)} 个特征\")\n",
" \n",
" # 特征数量统计\n",
" feature_counts = {\n",
" 'RMS能量特征': len([col for col in all_stats_df.columns if col.startswith('rms')]),\n",
" '零交叉率特征': len([col for col in all_stats_df.columns if col.startswith('zero')]),\n",
" '色谱图特征': len([col for col in all_stats_df.columns if col.startswith('chroma')]),\n",
" '梅尔频谱相关特征': len([col for col in all_stats_df.columns if col.startswith('mel') or col.startswith('log')])\n",
" }\n",
" \n",
" print(\"\\n特征数量统计:\")\n",
" display(pd.DataFrame([feature_counts], index=['特征数']))\n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.5 进行特征提取与处理\n",
"\n",
"接下来我们将从音频数据集(RAVDESS、CASIA和SAVEE)中提取特征\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"正在加载音频数据...\n",
"从RAVDESS数据集加载了 300 个样本\n",
"每种情感的限制: 50\n",
"情感分布:\n",
" angry: 50\n",
" fear: 50\n",
" happy: 50\n",
" neutral: 50\n",
" sad: 50\n",
" surprise: 50\n",
"从CASIA数据集加载了 300 个样本\n",
"每种情感的限制: 50\n",
"情感分布:\n",
" angry: 50\n",
" fear: 50\n",
" happy: 50\n",
" neutral: 50\n",
" sad: 50\n",
" surprise: 50\n",
"从SAVEE数据集加载了 300 个样本\n",
"每种情感的限制: 50\n",
"情感分布:\n",
" angry: 50\n",
" fear: 50\n",
" happy: 50\n",
" neutral: 50\n",
" sad: 50\n",
" surprise: 50\n",
"总共加载了 900 个样本\n",
"\n",
"正在提取特征...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"提取特征: 100%|██████████| 900/900 [00:35<00:00, 25.29it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"提取了 21 个特征\n",
"\n",
"划分数据集...\n",
"训练集: 540 样本\n",
"验证集: 180 样本\n",
"测试集: 180 样本\n",
"\n",
"训练集情感分布:\n",
" angry: 90\n",
" fear: 90\n",
" happy: 90\n",
" neutral: 90\n",
" sad: 90\n",
" surprise: 90\n",
"\n",
"验证集情感分布:\n",
" angry: 30\n",
" fear: 30\n",
" happy: 30\n",
" neutral: 30\n",
" sad: 30\n",
" surprise: 30\n",
"\n",
"测试集情感分布:\n",
" angry: 30\n",
" fear: 30\n",
" happy: 30\n",
" neutral: 30\n",
" sad: 30\n",
" surprise: 30\n",
"\n",
"标准化特征...\n",
"重塑为LSTM输入格式...\n",
"编码标签...\n",
"类别映射: {np.str_('angry'): 0, np.str_('fear'): 1, np.str_('happy'): 2, np.str_('neutral'): 3, np.str_('sad'): 4, np.str_('surprise'): 5}\n",
"\n",
"LSTM输入形状:\n",
"训练集: (540, 1, 21)\n",
"验证集: (180, 1, 21)\n",
"测试集: (180, 1, 21)\n",
"训练标签: (540, 6)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"#指定要处理的情感类别\n",
"SELECTED_EMOTIONS = ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']\n",
"\n",
"# 情绪映射字典,将不同数据集的标签映射到统一的情感类别\n",
"EMOTION_MAPPING = {\n",
" # CASIA 情感映射\n",
" 'angry': 'angry',\n",
" 'fear': 'fear',\n",
" 'happy': 'happy',\n",
" 'neutral': 'neutral',\n",
" 'sad': 'sad',\n",
" 'surprise': 'surprise',\n",
" \n",
" # SAVEE 情感映射\n",
" 'a': 'angry',\n",
" 'f': 'fear',\n",
" 'h': 'happy',\n",
" 'n': 'neutral',\n",
" 'sa': 'sad',\n",
" 'su': 'surprise',\n",
" 'd': 'disgust', # 注意:SAVEE有厌恶情绪,但不在我们选择的情感中\n",
" \n",
" # RAVDESS 情感映射\n",
" '01': 'neutral',\n",
" '02': 'calm', # 注意:RAVDESS有平静情绪,但不在我们选择的情感中\n",
" '03': 'happy',\n",
" '04': 'sad',\n",
" '05': 'angry',\n",
" '06': 'fear',\n",
" '07': 'disgust', # 注意:RAVDESS有厌恶情绪,但不在我们选择的情感中\n",
" '08': 'surprise'\n",
"}\n",
"\n",
"# 语言映射\n",
"LANGUAGE_MAPPING = {\n",
" 'casia': 'zh', # 中文\n",
" 'savee': 'en', # 英文\n",
" 'ravdess': 'en' # 英文\n",
"}\n",
"\n",
"def load_ravdess_data(data_path='./RAVDESS', limit=None):\n",
" \"\"\"\n",
" 加载RAVDESS英文情感语音数据集\n",
" \n",
" Args:\n",
" data_path: RAVDESS数据集路径\n",
" limit: 每种情感加载的最大文件数量\n",
" \n",
" Returns:\n",
" data_list: 包含(音频数据, 情感标签)的列表\n",
" \"\"\"\n",
" data_list = []\n",
" \n",
" # 确保路径存在\n",
" if not os.path.exists(data_path):\n",
" print(f\"警告: RAVDESS数据路径不存在: {data_path}\")\n",
" return data_list\n",
" \n",
" # 获取所有语音目录\n",
" try:\n",
" actor_dirs = glob.glob(os.path.join(data_path, \"Actor_*\"))\n",
" if not actor_dirs:\n",
" print(f\"警告: RAVDESS语音目录为空: {data_path}\")\n",
" except Exception as e:\n",
" print(f\"错误: 无法获取RAVDESS语音目录: {e}\")\n",
" return data_list\n",
" \n",
" # 计算每种情感的文件数\n",
" emotion_counts = {}\n",
" \n",
" for actor_dir in actor_dirs:\n",
" audio_files = glob.glob(os.path.join(actor_dir, \"*.wav\"))\n",
" \n",
" for audio_file in audio_files:\n",
" file_name = os.path.basename(audio_file)\n",
" \n",
" # RAVDESS文件名格式: 03-01-05-01-02-01-12.wav\n",
" # 05 表示情感类别 (angry)\n",
" parts = file_name.split('-')\n",
" if len(parts) >= 3:\n",
" emotion = parts[2]\n",
" \n",
" # 检查是否支持该情感\n",
" if emotion in EMOTION_MAPPING:\n",
" mapped_emotion = EMOTION_MAPPING[emotion]\n",
" \n",
" # 只选择我们指定的情感类别\n",
" if mapped_emotion in SELECTED_EMOTIONS:\n",
" # 检查是否达到了每种情感的限制\n",
" if limit is not None:\n",
" emotion_counts[mapped_emotion] = emotion_counts.get(mapped_emotion, 0) + 1\n",
" if emotion_counts[mapped_emotion] > limit:\n",
" continue\n",
" \n",
" try:\n",
" # 加载音频\n",
" audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 将数据和标签添加到列表\n",
" data_list.append((audio, mapped_emotion))\n",
" \n",
" except Exception as e:\n",
" print(f\"处理文件 {audio_file} 时出错: {e}\")\n",
" \n",
" print(f\"从RAVDESS数据集加载了 {len(data_list)} 个样本\")\n",
" if limit is not None:\n",
" print(f\"每种情感的限制: {limit}\")\n",
" \n",
" # 打印每种情感的样本数量\n",
" emotion_distribution = {}\n",
" for _, emotion in data_list:\n",
" emotion_distribution[emotion] = emotion_distribution.get(emotion, 0) + 1\n",
" \n",
" print(\"情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = emotion_distribution.get(emotion, 0)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" return data_list\n",
"\n",
"def load_casia_data(data_path='./CAISA', limit=None):\n",
" \"\"\"\n",
" 加载CASIA中文情感语音数据集\n",
" \n",
" Args:\n",
" data_path: CASIA数据集路径\n",
" limit: 每种情感加载的最大文件数量\n",
" \n",
" Returns:\n",
" data_list: 包含(音频数据, 情感标签)的列表\n",
" \"\"\"\n",
" data_list = []\n",
" \n",
" # 确保路径存在\n",
" if not os.path.exists(data_path):\n",
" print(f\"警告: CASIA数据路径不存在: {data_path}\")\n",
" return data_list\n",
" \n",
" # 计算每种情感的文件数\n",
" emotion_counts = {}\n",
" \n",
" # 获取所有演员目录\n",
" try:\n",
" actors = os.listdir(data_path)\n",
" except Exception as e:\n",
" print(f\"错误: 无法读取CASIA目录: {e}\")\n",
" return data_list\n",
" \n",
" for actor in actors:\n",
" # 跳过隐藏文件或非目录\n",
" actor_path = os.path.join(data_path, actor)\n",
" if not os.path.isdir(actor_path) or actor.startswith('_'):\n",
" continue\n",
" \n",
" emotions = os.listdir(actor_path)\n",
" \n",
" for emotion in emotions:\n",
" # 跳过隐藏文件或非目录\n",
" emotion_path = os.path.join(actor_path, emotion)\n",
" if not os.path.isdir(emotion_path) or emotion.startswith('_'):\n",
" continue\n",
" \n",
" # 检查是否支持该情感\n",
" if emotion in EMOTION_MAPPING:\n",
" mapped_emotion = EMOTION_MAPPING[emotion]\n",
" \n",
" # 只选择我们指定的情感类别\n",
" if mapped_emotion in SELECTED_EMOTIONS:\n",
" audio_files = glob.glob(os.path.join(emotion_path, \"*.wav\"))\n",
" \n",
" for audio_file in audio_files:\n",
" # 检查是否达到了每种情感的限制\n",
" if limit is not None:\n",
" emotion_counts[mapped_emotion] = emotion_counts.get(mapped_emotion, 0) + 1\n",
" if emotion_counts[mapped_emotion] > limit:\n",
" continue\n",
" \n",
" try:\n",
" # 加载音频\n",
" audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 将数据和标签添加到列表\n",
" data_list.append((audio, mapped_emotion))\n",
" \n",
" except Exception as e:\n",
" print(f\"处理文件 {audio_file} 时出错: {e}\")\n",
" \n",
" print(f\"从CASIA数据集加载了 {len(data_list)} 个样本\")\n",
" if limit is not None:\n",
" print(f\"每种情感的限制: {limit}\")\n",
" \n",
" # 打印每种情感的样本数量\n",
" emotion_distribution = {}\n",
" for _, emotion in data_list:\n",
" emotion_distribution[emotion] = emotion_distribution.get(emotion, 0) + 1\n",
" \n",
" print(\"情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = emotion_distribution.get(emotion, 0)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" return data_list\n",
"\n",
"def load_savee_data(data_path='./SAVEE', limit=None):\n",
" \"\"\"\n",
" 加载SAVEE英文情感语音数据集\n",
" \n",
" Args:\n",
" data_path: SAVEE数据集路径\n",
" limit: 每种情感加载的最大文件数量\n",
" \n",
" Returns:\n",
" data_list: 包含(音频数据, 情感标签)的列表\n",
" \"\"\"\n",
" data_list = []\n",
" \n",
" # 确保路径存在\n",
" if not os.path.exists(data_path):\n",
" print(f\"警告: SAVEE数据路径不存在: {data_path}\")\n",
" return data_list\n",
" \n",
" audio_path = os.path.join(data_path, \"AudioData\")\n",
" if not os.path.exists(audio_path):\n",
" print(f\"警告: SAVEE AudioData路径不存在: {audio_path}\")\n",
" return data_list\n",
" \n",
" # SAVEE数据集中的四个说话者\n",
" actors = ['DC', 'JE', 'JK', 'KL']\n",
" \n",
" # 计算每种情感的文件数\n",
" emotion_counts = {}\n",
" \n",
" for actor in actors:\n",
" actor_path = os.path.join(audio_path, actor)\n",
" if not os.path.isdir(actor_path):\n",
" print(f\"警告: SAVEE目录不存在: {actor_path}\")\n",
" continue\n",
" \n",
" audio_files = glob.glob(os.path.join(actor_path, \"*.wav\"))\n",
" \n",
" for audio_file in audio_files:\n",
" file_name = os.path.basename(audio_file)\n",
" \n",
" # 提取情感标签,SAVEE使用文件名的前1-2个字母作为情感标签\n",
" if file_name.startswith(\"sa\"):\n",
" emotion = \"sa\"\n",
" elif file_name.startswith(\"su\"):\n",
" emotion = \"su\"\n",
" else:\n",
" emotion = file_name[0]\n",
" \n",
" # 检查是否支持该情感\n",
" if emotion in EMOTION_MAPPING:\n",
" mapped_emotion = EMOTION_MAPPING[emotion]\n",
" \n",
" # 只选择我们指定的情感类别\n",
" if mapped_emotion in SELECTED_EMOTIONS:\n",
" # 检查是否达到了每种情感的限制\n",
" if limit is not None:\n",
" emotion_counts[mapped_emotion] = emotion_counts.get(mapped_emotion, 0) + 1\n",
" if emotion_counts[mapped_emotion] > limit:\n",
" continue\n",
" \n",
" try:\n",
" # 加载音频\n",
" audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 将数据和标签添加到列表\n",
" data_list.append((audio, mapped_emotion))\n",
" \n",
" except Exception as e:\n",
" print(f\"处理文件 {audio_file} 时出错: {e}\")\n",
" \n",
" print(f\"从SAVEE数据集加载了 {len(data_list)} 个样本\")\n",
" if limit is not None:\n",
" print(f\"每种情感的限制: {limit}\")\n",
" \n",
" # 打印每种情感的样本数量\n",
" emotion_distribution = {}\n",
" for _, emotion in data_list:\n",
" emotion_distribution[emotion] = emotion_distribution.get(emotion, 0) + 1\n",
" \n",
" print(\"情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = emotion_distribution.get(emotion, 0)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" return data_list\n",
"\n",
"# 加载数据\n",
"try:\n",
" print(\"正在加载音频数据...\")\n",
" \n",
" # 每个情感类别最多加载50个样本(为了平衡数据集)\n",
" samples_per_emotion = 50\n",
" \n",
" # 加载RAVDESS数据集\n",
" ravdess_data = load_ravdess_data(limit=samples_per_emotion)\n",
" \n",
" # 加载CASIA数据集\n",
" casia_data = load_casia_data(limit=samples_per_emotion)\n",
" \n",
" # 加载SAVEE数据集\n",
" savee_data = load_savee_data(limit=samples_per_emotion)\n",
" \n",
" # 合并所有数据\n",
" all_data = ravdess_data + casia_data + savee_data\n",
" \n",
" if not all_data:\n",
" raise ValueError(\"没有加载到任何有效数据。请检查数据路径是否正确。\")\n",
" \n",
" print(f\"总共加载了 {len(all_data)} 个样本\")\n",
" \n",
" # 提取音频和标签\n",
" audios, emotions = zip(*all_data)\n",
" \n",
" # 检查是否所有选定的情感类别都有样本\n",
" unique_emotions = set(emotions)\n",
" missing_emotions = [e for e in SELECTED_EMOTIONS if e not in unique_emotions]\n",
" if missing_emotions:\n",
" print(f\"警告:以下情感类别在数据中缺失: {missing_emotions}\")\n",
" print(\"将使用随机生成的音频数据补充缺失的情感类别\")\n",
" \n",
" # 为每个缺失的情感类别生成随机样本\n",
" for emotion in missing_emotions:\n",
" # 为了保持平衡,每个缺失类别添加5个样本\n",
" for i in range(5):\n",
" # 生成随机音频数据\n",
" random_audio = np.random.randn(MAX_SAMPLES)\n",
" audios = audios + (random_audio,)\n",
" emotions = emotions + (emotion,)\n",
" \n",
" # 提取特征\n",
" print(\"\\n正在提取特征...\")\n",
" features_list = []\n",
" for i, audio in enumerate(tqdm(audios, desc=\"提取特征\")):\n",
" features = extract_all_features(audio)\n",
" features_list.append(features)\n",
" \n",
" # 转换为特征矩阵\n",
" X, feature_names = features_to_matrix(features_list)\n",
" y = np.array(emotions)\n",
" \n",
" print(f\"提取了 {X.shape[1]} 个特征\")\n",
" \n",
" # 确保所有选定的情感类别都在标签编码器中\n",
" # 首先创建标签编码器并拟合所有选定的类别\n",
" encoder = LabelEncoder()\n",
" encoder.fit(SELECTED_EMOTIONS) # 确保所有选定的类别都被包含\n",
" \n",
" # 划分数据集 - 使用分层抽样确保每个类别在各集合中的比例一致\n",
" print(\"\\n划分数据集...\")\n",
" # 首先划分训练+验证集和测试集\n",
" X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
" \n",
" # 然后划分训练集和验证集\n",
" X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)\n",
" \n",
" print(f\"训练集: {X_train.shape[0]} 样本\")\n",
" print(f\"验证集: {X_val.shape[0]} 样本\")\n",
" print(f\"测试集: {X_test.shape[0]} 样本\")\n",
" \n",
" # 检查每个数据集中的情感分布\n",
" print(\"\\n训练集情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = np.sum(y_train == emotion)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" print(\"\\n验证集情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = np.sum(y_val == emotion)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" print(\"\\n测试集情感分布:\")\n",
" for emotion in SELECTED_EMOTIONS:\n",
" count = np.sum(y_test == emotion)\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" # 标准化特征\n",
" print(\"\\n标准化特征...\")\n",
" X_train_norm, X_val_norm, X_test_norm, scaler = normalize_features(X_train, X_val, X_test)\n",
" \n",
" # 重塑为LSTM输入格式\n",
" print(\"重塑为LSTM输入格式...\")\n",
" X_train_reshaped = reshape_for_lstm(X_train_norm)\n",
" X_val_reshaped = reshape_for_lstm(X_val_norm)\n",
" X_test_reshaped = reshape_for_lstm(X_test_norm)\n",
" \n",
" # 使用预先创建的编码器转换标签\n",
" print(\"编码标签...\")\n",
" y_train_encoded = encoder.transform(y_train)\n",
" y_val_encoded = encoder.transform(y_val)\n",
" y_test_encoded = encoder.transform(y_test)\n",
" \n",
" # 转换为独热编码\n",
" y_train_categorical = to_categorical(y_train_encoded, num_classes=len(SELECTED_EMOTIONS))\n",
" y_val_categorical = to_categorical(y_val_encoded, num_classes=len(SELECTED_EMOTIONS))\n",
" y_test_categorical = to_categorical(y_test_encoded, num_classes=len(SELECTED_EMOTIONS))\n",
" \n",
" print(f\"类别映射: {dict(zip(encoder.classes_, range(len(encoder.classes_))))}\")\n",
" \n",
" # 打印数据形状\n",
" print(f\"\\nLSTM输入形状:\")\n",
" print(f\"训练集: {X_train_reshaped.shape}\")\n",
" print(f\"验证集: {X_val_reshaped.shape}\")\n",
" print(f\"测试集: {X_test_reshaped.shape}\")\n",
" print(f\"训练标签: {y_train_categorical.shape}\")\n",
" \n",
"except Exception as e:\n",
" print(f\"加载数据时出错: {e}\")\n",
" import traceback\n",
" traceback.print_exc()\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"正在加载音频数据...\n",
"加载 angry 情感的数据...\n",
"加载 fear 情感的数据...\n",
"加载 happy 情感的数据...\n",
"加载 neutral 情感的数据...\n",
"加载 sad 情感的数据...\n",
"加载 surprise 情感的数据...\n",
"CASIA数据集: 共加载 300 个样本\n",
"RAVDESS数据集: 共加载 300 个样本\n",
" neutral: 50 个样本\n",
" happy: 50 个样本\n",
" sad: 50 个样本\n",
" angry: 50 个样本\n",
" fear: 50 个样本\n",
" surprise: 50 个样本\n",
"成功加载 600 个真实音频样本\n",
"从音频数据中提取特征...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"提取特征: 100%|██████████| 600/600 [00:21<00:00, 28.04it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"划分数据集...\n",
"训练集: 360 个样本\n",
"验证集: 120 个样本\n",
"测试集: 120 个样本\n",
"标准化特征...\n",
"重塑为LSTM输入格式...\n",
"对标签进行编码...\n",
"情感类别: ['angry' 'fear' 'happy' 'neutral' 'sad' 'surprise']\n",
"数据准备完成,可以用于模型训练\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# 加载数据的函数\n",
"\n",
"def load_casia_data(casia_path=\"./CAISA\", limit=None):\n",
" \"\"\"\n",
" 加载CASIA中文情感语音数据集\n",
" \n",
" Args:\n",
" casia_path: CASIA数据集路径\n",
" limit: 每种情感加载的最大样本数\n",
" \n",
" Returns:\n",
" data_list: 包含(音频数据, 情感标签)的列表\n",
" \"\"\"\n",
" data_list = []\n",
" \n",
" # 确保路径存在\n",
" if not os.path.exists(casia_path):\n",
" print(f\"警告: CASIA数据路径不存在: {casia_path}\")\n",
" return data_list\n",
" \n",
" # 定义情感标签\n",
" emotions = ['angry', 'fear', 'happy', 'neutral', 'sad', 'surprise']\n",
" \n",
" # 加载每种情感的数据\n",
" for emotion in emotions:\n",
" print(f\"加载 {emotion} 情感的数据...\")\n",
" \n",
" # 获取所有演员的目录\n",
" actors = []\n",
" for item in os.listdir(casia_path):\n",
" if os.path.isdir(os.path.join(casia_path, item)) and not item.startswith('_'):\n",
" actors.append(item)\n",
" \n",
" count = 0\n",
" for actor in actors:\n",
" # 检查该演员是否有该情感的目录\n",
" emotion_path = os.path.join(casia_path, actor, emotion)\n",
" if not os.path.exists(emotion_path):\n",
" continue\n",
" \n",
" # 获取该情感下的所有音频文件\n",
" audio_files = glob.glob(os.path.join(emotion_path, \"*.wav\"))\n",
" \n",
" # 如果有限制,只加载部分文件\n",
" if limit and count >= limit:\n",
" break\n",
" \n",
" for audio_file in audio_files:\n",
" try:\n",
" # 加载音频文件\n",
" audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE)\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 添加到数据列表\n",
" data_list.append((audio, emotion))\n",
" count += 1\n",
" \n",
" # 如果达到限制,跳出循环\n",
" if limit and count >= limit:\n",
" break\n",
" except Exception as e:\n",
" print(f\"处理文件 {audio_file} 时出错: {e}\")\n",
" \n",
" print(f\"CASIA数据集: 共加载 {len(data_list)} 个样本\")\n",
" return data_list\n",
"\n",
"def load_ravdess_data(ravdess_path=\"./RAVDESS\", limit=None):\n",
" \"\"\"\n",
" 加载RAVDESS英文情感语音数据集\n",
" \n",
" Args:\n",
" ravdess_path: RAVDESS数据集路径\n",
" limit: 每种情感加载的最大样本数\n",
" \n",
" Returns:\n",
" data_list: 包含(音频数据, 情感标签)的列表\n",
" \"\"\"\n",
" data_list = []\n",
" \n",
" # 确保路径存在\n",
" if not os.path.exists(ravdess_path):\n",
" print(f\"警告: RAVDESS数据路径不存在: {ravdess_path}\")\n",
" return data_list\n",
" \n",
" # RAVDESS情感标签映射\n",
" emotion_map = {\n",
" '01': 'neutral',\n",
" '03': 'happy',\n",
" '04': 'sad',\n",
" '05': 'angry',\n",
" '06': 'fear',\n",
" '08': 'surprise'\n",
" }\n",
" \n",
" # 只管这些情感类别(与CASIA一致)\n",
" target_emotions = ['neutral', 'happy', 'sad', 'angry', 'fear', 'surprise']\n",
" \n",
" # 每种情感的计数\n",
" emotion_counts = {emotion: 0 for emotion in target_emotions}\n",
" \n",
" try:\n",
" # 获取所有目录\n",
" actor_dirs = glob.glob(os.path.join(ravdess_path, \"Actor_*\"))\n",
" \n",
" for actor_dir in actor_dirs:\n",
" # 获取该目录的所有音频文件\n",
" audio_files = glob.glob(os.path.join(actor_dir, \"*.wav\"))\n",
" \n",
" for audio_file in audio_files:\n",
" # RAVDESS文件名格式: 03-01-05-01-02-01-12.wav\n",
" # 05 表示情感类别\n",
" filename = os.path.basename(audio_file)\n",
" parts = filename.split('-')\n",
" \n",
" if len(parts) >= 3:\n",
" emotion_code = parts[2]\n",
" \n",
" if emotion_code in emotion_map:\n",
" emotion = emotion_map[emotion_code]\n",
" \n",
" if emotion in target_emotions:\n",
" # 检查是否达到了该情感的样本数限制\n",
" if limit and emotion_counts[emotion] >= limit:\n",
" continue\n",
" \n",
" try:\n",
" # 加载音频文件\n",
" audio, sr = librosa.load(audio_file, sr=SAMPLE_RATE)\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 添加到数据列表\n",
" data_list.append((audio, emotion))\n",
" emotion_counts[emotion] += 1\n",
" except Exception as e:\n",
" print(f\"处理文件 {audio_file} 时出错: {e}\")\n",
" \n",
" print(f\"RAVDESS数据集: 共加载 {len(data_list)} 个样本\")\n",
"\n",
" for emotion, count in emotion_counts.items():\n",
" print(f\" {emotion}: {count} 个样本\")\n",
" \n",
" except Exception as e:\n",
" print(f\"错误: 无法获取RAVDESS演员目录: {e}\")\n",
" \n",
" return data_list\n",
"\n",
"# 尝试加载数据\n",
"try:\n",
" print(\"正在加载音频数据...\")\n",
" \n",
" # 每种情感的样本数限制\n",
" samples_per_emotion = 50 # 可以根据需要调整\n",
" \n",
" # 加载CASIA数据集\n",
" casia_data = load_casia_data(limit=samples_per_emotion)\n",
" \n",
" # 加载RAVDESS数据集\n",
" ravdess_data = load_ravdess_data(limit=samples_per_emotion)\n",
" \n",
" # 合并数据\n",
" all_data = casia_data + ravdess_data\n",
" \n",
" if all_data:\n",
" print(f\"成功加载 {len(all_data)} 个真实音频样本\")\n",
" \n",
" # 分离特征和标签\n",
" X_real = [item[0] for item in all_data]\n",
" y_real = [item[1] for item in all_data]\n",
" \n",
" # 提取特征\n",
" print(\"从音频数据中提取特征...\")\n",
" features_list = []\n",
" for audio in tqdm(X_real, desc=\"提取特征\"):\n",
" features = extract_all_features(audio)\n",
" features_list.append(features)\n",
" \n",
" # 转换为特征矩阵\n",
" X, feature_names = features_to_matrix(features_list)\n",
" \n",
" # 划分数据集\n",
" print(\"划分数据集...\")\n",
" X_train_val, X_test, y_train_val, y_test = train_test_split(X, y_real, test_size=0.2, random_state=42, stratify=y_real)\n",
" X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)\n",
" \n",
" print(f\"训练集: {len(X_train)} 个样本\")\n",
" print(f\"验证集: {len(X_val)} 个样本\")\n",
" print(f\"测试集: {len(X_test)} 个样本\")\n",
" \n",
" # 标准化特征\n",
" print(\"标准化特征...\")\n",
" X_train_norm, X_val_norm, X_test_norm, scaler = normalize_features(X_train, X_val, X_test)\n",
" \n",
" # 重塑为LSTM输入格式\n",
" print(\"重塑为LSTM输入格式...\")\n",
" X_train_reshaped = reshape_for_lstm(X_train_norm)\n",
" X_val_reshaped = reshape_for_lstm(X_val_norm)\n",
" X_test_reshaped = reshape_for_lstm(X_test_norm)\n",
" \n",
" # 对情感标签进行编码\n",
" print(\"对标签进行编码...\")\n",
" encoder = LabelEncoder()\n",
" encoder.fit(y_train + y_val + y_test)\n",
" \n",
" print(f\"情感类别: {encoder.classes_}\")\n",
" \n",
" y_train_encoded = encoder.transform(y_train)\n",
" y_val_encoded = encoder.transform(y_val)\n",
" y_test_encoded = encoder.transform(y_test)\n",
" \n",
" # 转换为独热编码\n",
" y_train_categorical = to_categorical(y_train_encoded)\n",
" y_val_categorical = to_categorical(y_val_encoded)\n",
" y_test_categorical = to_categorical(y_test_encoded)\n",
" \n",
" print(\"数据准备完成,可以用于模型训练\")\n",
" else:\n",
" print(\"没有加载到任何音频数据,将使用演示数据\")\n",
" # 使用前面创建的演示数据\n",
" # 这里不需要任何操作,因为之前已经创建了演示数据\n",
"\n",
"except Exception as e:\n",
" print(f\"加载数据时出错: {e}\")\n",
"\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 2.6 特征数据处理工具函数\n",
"\n",
"这一节我们实现用于特征处理的工具函数:\n",
"1. 综合特征提取函数\n",
"2. 特征矩阵转换\n",
"3. 特征标准化\n",
"4. LSTM数据格式转换\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"真实数据集统计信息:\n",
"特征数量: 21\n",
"前5个特征: ['pitch_mean', 'pitch_std', 'pitch_max', 'pitch_min', 'tuning_offset']\n",
"\n",
"训练集类别分布:\n",
" angry: 60\n",
" fear: 60\n",
" happy: 60\n",
" neutral: 60\n",
" sad: 60\n",
" surprise: 60\n",
"\n",
"第一个样本的特征统计:\n",
" 均值: 0.2349\n",
" 标准差: 0.8315\n",
" 最小值: -0.8931\n",
" 最大值: 1.6344\n"
]
}
],
"source": [
"def extract_all_features(audio, sr=SAMPLE_RATE):\n",
" \"\"\"\n",
" 综合特征提取函数,提取所有音频特征\n",
" \n",
" Args:\n",
" audio: 音频信号\n",
" sr: 采样率\n",
" \n",
" Returns:\n",
" features: 特征字典\n",
" \"\"\"\n",
" features = {}\n",
" \n",
" # 1. 提取音高特征\n",
" pitch_features = extract_pitch_features(audio, sr)\n",
" features.update(pitch_features)\n",
" \n",
" # 2. 提取调谐偏差特征\n",
" tuning_features = extract_tuning_features(audio, sr)\n",
" features.update(tuning_features)\n",
" \n",
" # 3. 提取频谱质心特征\n",
" centroid_features = extract_spectral_centroid_features(audio, sr)\n",
" features.update(centroid_features)\n",
" \n",
" # 4. 提取频谱平坦度特征\n",
" flatness_features = extract_spectral_flatness_features(audio, sr)\n",
" features.update(flatness_features)\n",
" \n",
" # 5. 提取MFCC特征\n",
" mfcc_features = extract_mfcc_features(audio, sr)\n",
" features.update(mfcc_features)\n",
" \n",
" # 6. 提取光谱对比度特征\n",
" contrast_features = extract_spectral_contrast_features(audio, sr)\n",
" features.update(contrast_features)\n",
" \n",
" # 7. 提取RMS和ZCR特征\n",
" rms_features = extract_rms_features(audio, sr)\n",
" features.update(rms_features)\n",
" \n",
" # 8. 提取色谱图特征\n",
" chroma_features = extract_chroma_features(audio, sr)\n",
" features.update(chroma_features)\n",
" \n",
" # 9. 添加统计矩\n",
" features['audio_mean'] = np.mean(audio)\n",
" features['audio_std'] = np.std(audio)\n",
" features['audio_skew'] = np.mean((audio - np.mean(audio))**3) / (np.std(audio)**3) if np.std(audio) > 0 else 0\n",
" features['audio_kurtosis'] = np.mean((audio - np.mean(audio))**4) / (np.std(audio)**4) - 3 if np.std(audio) > 0 else 0\n",
" \n",
" return features\n",
"\n",
"def features_to_matrix(features_list, feature_names=None):\n",
" \"\"\"\n",
" 将特征字典列表转换为特征矩阵\n",
" \n",
" Args:\n",
" features_list: 特征字典列表\n",
" feature_names: 特征名称列表,如果为None则从第一个非空字典中获取\n",
" \n",
" Returns:\n",
" X: 特征矩阵\n",
" feature_names: 特征名称列表\n",
" \"\"\"\n",
" # 如果没有提供特征名称,从第一个非空字典中获取\n",
" if feature_names is None:\n",
" for features in features_list:\n",
" if features: # 非空字典\n",
" feature_names = list(features.keys())\n",
" break\n",
" \n",
" if feature_names is None:\n",
" raise ValueError(\"所有特征字典都是空的,无法确定特征名称\")\n",
" \n",
" # 创建特征矩阵\n",
" X = np.zeros((len(features_list), len(feature_names)))\n",
" \n",
" for i, features in enumerate(features_list):\n",
" if not features: # 空字典\n",
" # 填充为0,或者可以使用平均值等\n",
" continue\n",
" \n",
" for j, name in enumerate(feature_names):\n",
" if name in features:\n",
" X[i, j] = features[name]\n",
" \n",
" return X, feature_names\n",
"\n",
"def normalize_features(X_train, X_val=None, X_test=None):\n",
" \"\"\"\n",
" 标准化特征\n",
" \n",
" Args:\n",
" X_train: 训练集特征矩阵\n",
" X_val: 验证集特征矩阵\n",
" X_test: 测试集特征矩阵\n",
" \n",
" Returns:\n",
" X_train_norm: 标准化后的训练集\n",
" X_val_norm: 标准化后的验证集 (如果提供)\n",
" X_test_norm: 标准化后的测试集 (如果提供)\n",
" scaler: 标准化器对象\n",
" \"\"\"\n",
" # 初始化标准化器\n",
" scaler = StandardScaler()\n",
" \n",
" # 使用训练集拟合标准化器\n",
" X_train_norm = scaler.fit_transform(X_train)\n",
" \n",
" # 标准化验证集和测试集 (如果提供)\n",
" results = [X_train_norm]\n",
" \n",
" if X_val is not None:\n",
" X_val_norm = scaler.transform(X_val)\n",
" results.append(X_val_norm)\n",
" \n",
" if X_test is not None:\n",
" X_test_norm = scaler.transform(X_test)\n",
" results.append(X_test_norm)\n",
" \n",
" # 添加标准化器到结果\n",
" results.append(scaler)\n",
" \n",
" return tuple(results)\n",
"\n",
"def reshape_for_lstm(X):\n",
" \"\"\"\n",
" 将二维特征矩阵重塑为LSTM所需的三维格式\n",
" \n",
" Args:\n",
" X: 二维特征矩阵,形状为 (n_samples, n_features)\n",
" \n",
" Returns:\n",
" X_reshaped: 重塑后的三维特征张量,形状为 (n_samples, 1, n_features)\n",
" \"\"\"\n",
" # LSTM输入形状为: (n_samples, time_steps, n_features)\n",
" # 我们使用time_steps=1,因为我们处理的是静态特征\n",
" return X.reshape(X.shape[0], 1, X.shape[1])\n",
"\n",
"# 工具函数代码(这些函数已经在前面定义并用于数据处理)\n",
"\n",
"# 打印已加载数据的一些统计信息\n",
"try:\n",
" if 'X_train_reshaped' in locals() and 'y_train_categorical' in locals():\n",
" print(\"\\n真实数据集统计信息:\")\n",
" print(f\"特征数量: {X_train_reshaped.shape[2]}\")\n",
" \n",
" # 打印前5个特征名称\n",
" if 'feature_names' in locals() and feature_names:\n",
" print(f\"前5个特征: {feature_names[:5]}\")\n",
" \n",
" # 打印类别分布\n",
" if 'y_train' in locals() and 'encoder' in locals():\n",
" print(\"\\n训练集类别分布:\")\n",
" for emotion, count in zip(*np.unique(y_train, return_counts=True)):\n",
" print(f\" {emotion}: {count}\")\n",
" \n",
" # 打印一个样本的特征统计信息\n",
" if len(X_train_reshaped) > 0:\n",
" print(\"\\n第一个样本的特征统计:\")\n",
" print(f\" 均值: {np.mean(X_train_reshaped[0]):.4f}\")\n",
" print(f\" 标准差: {np.std(X_train_reshaped[0]):.4f}\")\n",
" print(f\" 最小值: {np.min(X_train_reshaped[0]):.4f}\")\n",
" print(f\" 最大值: {np.max(X_train_reshaped[0]):.4f}\")\n",
" else:\n",
" print(\"数据尚未加载,无法显示统计信息\")\n",
"except Exception as e:\n",
" print(f\"显示统计信息时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"# 3. 分类预测模型构建与分析\n",
"\n",
"在这一部分,我们将构建和训练LSTM神经网络模型来进行情感分类,并评估模型性能。\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 3.1 构建LSTM神经网络模型\n",
"\n",
"LSTM (Long Short-Term Memory) 是一种特殊的循环神经网络 (RNN),能够有效地处理序列数据,特别适合用于音频情感分析。在这一节中,我们将构建一个双向LSTM模型。\n"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"输入形状: (1, 21)\n",
"情感类别数: 6\n",
"训练标签形状: (360, 6)\n",
"验证标签形状: (120, 6)\n",
"测试标签形状: (120, 6)\n"
]
},
{
"data": {
"text/html": [
"Model: \"sequential_1\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"sequential_1\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ bidirectional_2 (Bidirectional) │ (None, 1, 256) │ 153,600 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_4 │ (None, 1, 256) │ 1,024 │\n",
"│ (BatchNormalization) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_4 (Dropout) │ (None, 1, 256) │ 0 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidirectional_3 (Bidirectional) │ (None, 256) │ 394,240 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_5 │ (None, 256) │ 1,024 │\n",
"│ (BatchNormalization) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_5 (Dropout) │ (None, 256) │ 0 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_3 (Dense) │ (None, 128) │ 32,896 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_6 │ (None, 128) │ 512 │\n",
"│ (BatchNormalization) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_6 (Dropout) │ (None, 128) │ 0 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_4 (Dense) │ (None, 64) │ 8,256 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_7 │ (None, 64) │ 256 │\n",
"│ (BatchNormalization) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_7 (Dropout) │ (None, 64) │ 0 │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_5 (Dense) │ (None, 6) │ 390 │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
"│ bidirectional_2 (\u001b[38;5;33mBidirectional\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m153,600\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_4 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n",
"│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_4 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ bidirectional_3 (\u001b[38;5;33mBidirectional\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m394,240\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_5 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n",
"│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_5 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_3 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_6 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n",
"│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_6 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_4 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ batch_normalization_7 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m256\u001b[0m │\n",
"│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dropout_7 (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n",
"├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
"│ dense_5 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m6\u001b[0m) │ \u001b[38;5;34m390\u001b[0m │\n",
"└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Total params: 592,198 (2.26 MB)\n",
"\n"
],
"text/plain": [
"\u001b[1m Total params: \u001b[0m\u001b[38;5;34m592,198\u001b[0m (2.26 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Trainable params: 590,790 (2.25 MB)\n",
"\n"
],
"text/plain": [
"\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m590,790\u001b[0m (2.25 MB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" Non-trainable params: 1,408 (5.50 KB)\n",
"\n"
],
"text/plain": [
"\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m1,408\u001b[0m (5.50 KB)\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"训练参数:\n",
"LSTM单元数: 128\n",
"Dropout率: 0.3\n",
"正则化系数: 0.001\n",
"学习率: 0.001\n"
]
}
],
"source": [
"class EmotionModel:\n",
" \"\"\"\n",
" 语音情感识别模型类\n",
" \"\"\"\n",
" \n",
" def __init__(self, input_shape, num_emotions):\n",
" \"\"\"\n",
" 初始化模型\n",
" \n",
" Args:\n",
" input_shape: 输入特征的形状,例如 (1, 193) 表示每个样本有1个时间步和193个特征\n",
" num_emotions: 情感类别数量\n",
" \"\"\"\n",
" self.input_shape = input_shape\n",
" self.num_emotions = num_emotions\n",
" self.model = None\n",
" \n",
" def build_model(self, lstm_units=128, dropout_rate=0.3, regularization_rate=0.001):\n",
" \"\"\"\n",
" 构建LSTM模型\n",
" \n",
" Args:\n",
" lstm_units: LSTM层的单元数\n",
" dropout_rate: Dropout层的丢弃率\n",
" regularization_rate: L2正则化系数\n",
" \n",
" Returns:\n",
" 构建好的模型\n",
" \"\"\"\n",
" from tensorflow.keras.models import Sequential\n",
" from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional\n",
" from tensorflow.keras.regularizers import l2\n",
" \n",
" # 定义正则化器\n",
" regularizer = l2(regularization_rate)\n",
" \n",
" # 构建模型\n",
" self.model = Sequential([\n",
" # 第一个双向LSTM层,返回序列\n",
" Bidirectional(LSTM(lstm_units, return_sequences=True, \n",
" kernel_regularizer=regularizer,\n",
" recurrent_regularizer=regularizer), \n",
" input_shape=self.input_shape),\n",
" BatchNormalization(),\n",
" Dropout(dropout_rate),\n",
" \n",
" # 第二个双向LSTM层,不返回序列\n",
" Bidirectional(LSTM(lstm_units, return_sequences=False, \n",
" kernel_regularizer=regularizer,\n",
" recurrent_regularizer=regularizer)),\n",
" BatchNormalization(),\n",
" Dropout(dropout_rate),\n",
" \n",
" # 全连接层\n",
" Dense(128, activation='relu', kernel_regularizer=regularizer),\n",
" BatchNormalization(),\n",
" Dropout(dropout_rate),\n",
" \n",
" Dense(64, activation='relu', kernel_regularizer=regularizer),\n",
" BatchNormalization(),\n",
" Dropout(dropout_rate),\n",
" \n",
" # 输出层\n",
" Dense(self.num_emotions, activation='softmax')\n",
" ])\n",
" \n",
" return self.model\n",
" \n",
" def compile_model(self, learning_rate=0.001):\n",
" \"\"\"\n",
" 编译模型\n",
" \n",
" Args:\n",
" learning_rate: 学习率\n",
" \"\"\"\n",
" from tensorflow.keras.optimizers import Adam\n",
" \n",
" optimizer = Adam(learning_rate=learning_rate)\n",
" self.model.compile(\n",
" optimizer=optimizer,\n",
" loss='categorical_crossentropy',\n",
" metrics=['accuracy']\n",
" )\n",
" \n",
" def get_callbacks(self, checkpoint_path=None):\n",
" \"\"\"\n",
" 获取回调函数列表\n",
" \n",
" Args:\n",
" checkpoint_path: 模型检查点保存路径\n",
" \n",
" Returns:\n",
" callbacks: 回调函数列表\n",
" \"\"\"\n",
" from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n",
" \n",
" callbacks = []\n",
" \n",
" # 提前停止\n",
" early_stopping = EarlyStopping(\n",
" monitor='val_loss',\n",
" patience=10,\n",
" restore_best_weights=True\n",
" )\n",
" callbacks.append(early_stopping)\n",
" \n",
" # 学习率衰减\n",
" reduce_lr = ReduceLROnPlateau(\n",
" monitor='val_loss',\n",
" factor=0.2,\n",
" patience=5,\n",
" min_lr=1e-6\n",
" )\n",
" callbacks.append(reduce_lr)\n",
" \n",
" # 模型检查点\n",
" if checkpoint_path:\n",
" checkpoint = ModelCheckpoint(\n",
" filepath=checkpoint_path,\n",
" monitor='val_loss',\n",
" save_best_only=True,\n",
" save_weights_only=False,\n",
" mode='min',\n",
" verbose=1\n",
" )\n",
" callbacks.append(checkpoint)\n",
" \n",
" return callbacks\n",
" \n",
" def train(self, X_train, y_train, X_val, y_val, epochs=100, batch_size=32, checkpoint_path=None):\n",
" \"\"\"\n",
" 训练模型\n",
" \n",
" Args:\n",
" X_train: 训练集特征\n",
" y_train: 训练集标签\n",
" X_val: 验证集特征\n",
" y_val: 验证集标签\n",
" epochs: 训练轮数\n",
" batch_size: 批量大小\n",
" checkpoint_path: 模型检查点保存路径\n",
" \n",
" Returns:\n",
" 训练历史\n",
" \"\"\"\n",
" callbacks = self.get_callbacks(checkpoint_path)\n",
" \n",
" history = self.model.fit(\n",
" X_train, y_train,\n",
" validation_data=(X_val, y_val),\n",
" epochs=epochs,\n",
" batch_size=batch_size,\n",
" callbacks=callbacks,\n",
" verbose=1\n",
" )\n",
" \n",
" return history\n",
" \n",
" def evaluate(self, X_test, y_test):\n",
" \"\"\"\n",
" 评估模型\n",
" \n",
" Args:\n",
" X_test: 测试集特征\n",
" y_test: 测试集标签\n",
" \n",
" Returns:\n",
" 评估结果(损失和准确率)\n",
" \"\"\"\n",
" return self.model.evaluate(X_test, y_test)\n",
" \n",
" def predict(self, X):\n",
" \"\"\"\n",
" 预测情感\n",
" \n",
" Args:\n",
" X: 特征\n",
" \n",
" Returns:\n",
" 预测结果\n",
" \"\"\"\n",
" return self.model.predict(X)\n",
" \n",
" def save(self, filepath):\n",
" \"\"\"\n",
" 保存模型\n",
" \n",
" Args:\n",
" filepath: 保存路径\n",
" \"\"\"\n",
" self.model.save(filepath)\n",
" \n",
" @classmethod\n",
" def load(cls, filepath):\n",
" \"\"\"\n",
" 加载模型\n",
" \n",
" Args:\n",
" filepath: 模型文件路径\n",
" \n",
" Returns:\n",
" 加载的模型\n",
" \"\"\"\n",
" from tensorflow.keras.models import load_model\n",
" \n",
" model_instance = cls(input_shape=None, num_emotions=None)\n",
" model_instance.model = load_model(filepath)\n",
" \n",
" return model_instance\n",
"\n",
"# 使用数据构建和编译模型\n",
"try:\n",
" if 'X_train_reshaped' in locals() and 'y_train_categorical' in locals():\n",
" # 获取输入形状和类别数\n",
" time_steps = X_train_reshaped.shape[1] # 时间步数\n",
" features = X_train_reshaped.shape[2] # 特征数\n",
" num_emotions = len(SELECTED_EMOTIONS) # 情感类别数\n",
" \n",
" print(f\"输入形状: ({time_steps}, {features})\")\n",
" print(f\"情感类别数: {num_emotions}\")\n",
" print(f\"训练标签形状: {y_train_categorical.shape}\")\n",
" print(f\"验证标签形状: {y_val_categorical.shape}\")\n",
" print(f\"测试标签形状: {y_test_categorical.shape}\")\n",
" \n",
" # 检查所有标签集的类别数是否一致\n",
" if y_train_categorical.shape[1] != y_val_categorical.shape[1] or y_train_categorical.shape[1] != y_test_categorical.shape[1]:\n",
" print(\"警告: 不同数据集的类别数不一致!\")\n",
" \n",
" # 设置训练参数\n",
" lstm_units = 128\n",
" dropout_rate = 0.3\n",
" regularization_rate = 0.001\n",
" learning_rate = 0.001\n",
" \n",
" # 创建模型,确保使用正确的类别数\n",
" emotion_model = EmotionModel(\n",
" input_shape=(time_steps, features),\n",
" num_emotions=num_emotions\n",
" )\n",
" \n",
" # 构建模型\n",
" model = emotion_model.build_model(\n",
" lstm_units=lstm_units,\n",
" dropout_rate=dropout_rate,\n",
" regularization_rate=regularization_rate\n",
" )\n",
" \n",
" # 编译模型\n",
" emotion_model.compile_model(learning_rate=learning_rate)\n",
" \n",
" # 显示模型摘要\n",
" model.summary()\n",
" \n",
" # 打印训练参数\n",
" print(\"\\n训练参数:\")\n",
" print(f\"LSTM单元数: {lstm_units}\")\n",
" print(f\"Dropout率: {dropout_rate}\")\n",
" print(f\"正则化系数: {regularization_rate}\")\n",
" print(f\"学习率: {learning_rate}\")\n",
" else:\n",
" print(\"数据尚未加载,无法构建模型\")\n",
"except Exception as e:\n",
" print(f\"构建模型时出错: {e}\")\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 3.2 模型训练与评估代码\n",
"\n",
"在这一节中,我们将使用数据集训练模型,并评估模型性能。\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# 修复分类报告中的类别不匹配问题\n",
"def evaluate_model(y_true, y_pred, class_names):\n",
" \"\"\"\n",
" 评估模型性能\n",
" \n",
" Args:\n",
" y_true: 标签(独热编码)\n",
" y_pred: 预测标签(概率值)\n",
" class_names: 类别名称\n",
" \n",
" Returns:\n",
" 评估结果字典\n",
" \"\"\"\n",
" # 将独热编码转换为类别索引\n",
" y_true_classes = np.argmax(y_true, axis=1)\n",
" y_pred_classes = np.argmax(y_pred, axis=1)\n",
" \n",
" # 获取实际出现在数据中的类别索引\n",
" present_class_indices = np.unique(np.concatenate([y_true_classes, y_pred_classes]))\n",
" print(f\"测试集中实际出现的类别索引: {present_class_indices}\")\n",
" \n",
" # 获取实际出现的类别名称\n",
" present_class_names = [class_names[i] for i in present_class_indices]\n",
" print(f\"测试集中实际出现的类别: {present_class_names}\")\n",
" \n",
" # 检查是否有缺失的类别\n",
" all_class_indices = set(range(len(class_names)))\n",
" missing_indices = all_class_indices - set(present_class_indices)\n",
" if missing_indices:\n",
" missing_classes = [class_names[i] for i in missing_indices]\n",
" print(f\"警告: 测试集中缺少以下类别: {missing_classes}\")\n",
" print(\"这可能是由于数据集划分不均衡或数据集太小所致\")\n",
" \n",
" # 计算混淆矩阵 - 仅使用实际出现的类别\n",
" cm = confusion_matrix(y_true_classes, y_pred_classes, \n",
" labels=present_class_indices)\n",
" \n",
" # 计算准确率\n",
" accuracy = np.sum(y_true_classes == y_pred_classes) / len(y_true_classes)\n",
" \n",
" # 获取分类报告 - 仅使用实际出现的类别\n",
" report = classification_report(y_true_classes, y_pred_classes, \n",
" labels=present_class_indices,\n",
" target_names=present_class_names,\n",
" output_dict=True)\n",
" \n",
" # 返回结果,包括实际使用的类别信息\n",
" return {\n",
" 'confusion_matrix': cm,\n",
" 'accuracy': accuracy,\n",
" 'report': report,\n",
" 'present_class_indices': present_class_indices,\n",
" 'present_class_names': present_class_names\n",
" }\n",
"\n",
"# 修复绘制混淆矩阵函数,以适应可能存在的类别缺失\n",
"def plot_confusion_matrix(y_true, y_pred, class_names, save_path=None, normalize=False, present_class_indices=None, present_class_names=None):\n",
" \"\"\"\n",
" 绘制混淆矩阵\n",
" \n",
" Args:\n",
" y_true: 标签\n",
" y_pred: 预测标签\n",
" class_names: 类别名称\n",
" save_path: 保存路径\n",
" normalize: 是否归一化\n",
" present_class_indices: 实际存在的类别索引\n",
" present_class_names: 实际存在的类别名称\n",
" \"\"\"\n",
" # 如果提供了实际存在的类别信息,则使用它们\n",
" labels_to_use = present_class_indices if present_class_indices is not None else None\n",
" display_names = present_class_names if present_class_names is not None else class_names\n",
" \n",
" # 计算混淆矩阵\n",
" cm = confusion_matrix(y_true, y_pred, labels=labels_to_use)\n",
" \n",
" # 归一化混淆矩阵\n",
" if normalize:\n",
" cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
" fmt = '.2f'\n",
" else:\n",
" fmt = 'd'\n",
" \n",
" plt.figure(figsize=(10, 8))\n",
" sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', \n",
" xticklabels=display_names, \n",
" yticklabels=display_names)\n",
" plt.xlabel('预测标签')\n",
" plt.ylabel('标签')\n",
" plt.title('混淆矩阵')\n",
" plt.tight_layout()\n",
" \n",
" if save_path:\n",
" plt.savefig(save_path)\n",
" \n",
" plt.show()\n",
"\n",
"# 修改打印评估结果函数,以显示缺失类别信息\n",
"def print_evaluation_results(results, emotion_classes):\n",
" \"\"\"\n",
" 打印评估结果\n",
" \n",
" Args:\n",
" results: 评估结果字典\n",
" emotion_classes: 所有情感类别\n",
" \"\"\"\n",
" print(f\"准确率: {results['accuracy']:.4f}\")\n",
" \n",
" # 打印缺失的类别信息\n",
" all_class_indices = set(range(len(emotion_classes)))\n",
" missing_indices = all_class_indices - set(results.get('present_class_indices', all_class_indices))\n",
" if missing_indices:\n",
" missing_classes = [emotion_classes[i] for i in missing_indices]\n",
" print(f\"\\n警告: 测试集中缺少以下类别: {missing_classes}\")\n",
" print(\"这可能是由于数据集划分不均衡或数据集太小所致\")\n",
" \n",
" print(\"\\n分类报告:\")\n",
" report = results['report']\n",
" \n",
" # 打印每个类别的指标\n",
" for class_name in sorted(report.keys()):\n",
" if class_name not in ['accuracy', 'macro avg', 'weighted avg']:\n",
" metrics = report[class_name]\n",
" print(f\"{class_name:>10}: 精确率={metrics['precision']:.4f}, 召回率={metrics['recall']:.4f}, F1分数={metrics['f1-score']:.4f}\")\n",
" \n",
" # 打印平均指标\n",
" print(\"\\n平均指标:\")\n",
" print(f\"宏平均: 精确率={report['macro avg']['precision']:.4f}, 召回率={report['macro avg']['recall']:.4f}, F1分数={report['macro avg']['f1-score']:.4f}\")\n",
" print(f\"加权平均: 精确率={report['weighted avg']['precision']:.4f}, 召回率={report['weighted avg']['recall']:.4f}, F1分数={report['weighted avg']['f1-score']:.4f}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"在测试集上预测:\n",
"\u001b[1m4/4\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 147ms/step\n",
"测试集中实际出现的类别索引: [0 1 2 3 4 5]\n",
"测试集中实际出现的类别: [np.str_('angry'), np.str_('fear'), np.str_('happy'), np.str_('neutral'), np.str_('sad'), np.str_('surprise')]\n",
"准确率: 0.1833\n",
"\n",
"分类报告:\n",
" angry: 精确率=0.0000, 召回率=0.0000, F1分数=0.0000\n",
" fear: 精确率=0.1887, 召回率=1.0000, F1分数=0.3175\n",
" happy: 精确率=0.1111, 召回率=0.0500, F1分数=0.0690\n",
" neutral: 精确率=0.2500, 召回率=0.0500, F1分数=0.0833\n",
" sad: 精确率=0.0000, 召回率=0.0000, F1分数=0.0000\n",
" surprise: 精确率=0.0000, 召回率=0.0000, F1分数=0.0000\n",
"\n",
"平均指标:\n",
"宏平均: 精确率=0.0916, 召回率=0.1833, F1分数=0.0783\n",
"加权平均: 精确率=0.0916, 召回率=0.1833, F1分数=0.0783\n",
"评估模型时出错: name 'sns' is not defined\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"C:\\Users\\lenovo\\AppData\\Local\\Temp\\ipykernel_11496\\64319893.py\", line 22, in \n",
" plot_confusion_matrix(\n",
" File \"C:\\Users\\lenovo\\AppData\\Local\\Temp\\ipykernel_11496\\4080349739.py\", line 85, in plot_confusion_matrix\n",
" sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues',\n",
" ^^^\n",
"NameError: name 'sns' is not defined\n"
]
},
{
"data": {
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 重新进行评估和绘制,使用修复后的函数\n",
"try:\n",
" # 在测试集上预测\n",
" print(\"在测试集上预测:\")\n",
" y_pred = emotion_model.model.predict(X_test_reshaped)\n",
" \n",
" # 获取类别标签\n",
" y_true_classes = np.argmax(y_test_categorical, axis=1)\n",
" y_pred_classes = np.argmax(y_pred, axis=1)\n",
" \n",
" # 保存标签和预测标签,用于后续分析\n",
" np.save('./output/emotion_model/y_true.npy', y_true_classes)\n",
" np.save('./output/emotion_model/y_pred.npy', y_pred_classes)\n",
" \n",
" # 评估模型\n",
" results = evaluate_model(y_test_categorical, y_pred, emotion_classes)\n",
" \n",
" # 打印评估结果\n",
" print_evaluation_results(results, emotion_classes)\n",
" \n",
" # 绘制混淆矩阵\n",
" plot_confusion_matrix(\n",
" y_true_classes, \n",
" y_pred_classes, \n",
" emotion_classes, \n",
" save_path='./output/emotion_model/confusion_matrix.png',\n",
" present_class_indices=results.get('present_class_indices'),\n",
" present_class_names=results.get('present_class_names')\n",
" )\n",
" \n",
"except Exception as e:\n",
" print(f\"评估模型时出错: {e}\")\n",
" import traceback\n",
" traceback.print_exc()\n"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"音高帧级数据 (前10行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" pitch | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 2458.481445 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 182.268219 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 190.799026 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 197.138947 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 423.822876 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
" 0.11610 | \n",
" 166.867813 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
" 0.13932 | \n",
" 172.440811 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
" 0.16254 | \n",
" 168.360764 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
" 0.18576 | \n",
" 148.656494 | \n",
"
\n",
" \n",
" | 9 | \n",
" 9 | \n",
" 0.20898 | \n",
" 151.040848 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time pitch\n",
"0 0 0.00000 2458.481445\n",
"1 1 0.02322 182.268219\n",
"2 2 0.04644 190.799026\n",
"3 3 0.06966 197.138947\n",
"4 4 0.09288 423.822876\n",
"5 5 0.11610 166.867813\n",
"6 6 0.13932 172.440811\n",
"7 7 0.16254 168.360764\n",
"8 8 0.18576 148.656494\n",
"9 9 0.20898 151.040848"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高和调谐统计特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" pitch_range | \n",
" pitch_median | \n",
" tuning_offset | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3988.073975 | \n",
" 147.432693 | \n",
" 3840.641357 | \n",
" 234.112061 | \n",
" -0.17 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_mean pitch_std pitch_max pitch_min pitch_range pitch_median \\\n",
"0 573.830444 853.130066 3988.073975 147.432693 3840.641357 234.112061 \n",
"\n",
" tuning_offset \n",
"0 -0.17 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高四分位数统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_q1 | \n",
" pitch_median | \n",
" pitch_q3 | \n",
" pitch_iqr | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 173.241608 | \n",
" 234.112061 | \n",
" 488.627747 | \n",
" 315.386139 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_q1 pitch_median pitch_q3 pitch_iqr\n",
"0 173.241608 234.112061 488.627747 315.386139"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试音高和调谐偏差特征提取,以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 创建一个存储音高和调谐特征的DataFrame\n",
" pitch_df = pd.DataFrame()\n",
" \n",
" # 提取音高特征\n",
" pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)\n",
" \n",
" # 提取每帧最大幅度对应的音高\n",
" pitch_values = []\n",
" times_list = []\n",
" frame_indices = []\n",
" \n",
" for t in range(pitches.shape[1]):\n",
" idx = np.argmax(magnitudes[:, t])\n",
" pitch = pitches[idx, t]\n",
" time = t * 512 / sr # 计算时间点\n",
" \n",
" if pitch > 0: # 过滤掉静音帧\n",
" pitch_values.append(pitch)\n",
" times_list.append(time)\n",
" frame_indices.append(t)\n",
" \n",
" # 创建音高帧级DataFrame\n",
" pitch_frames_df = pd.DataFrame({\n",
" 'frame': frame_indices,\n",
" 'time': times_list,\n",
" 'pitch': pitch_values\n",
" })\n",
" \n",
" # 计算音高统计特征\n",
" pitch_stats = {\n",
" 'pitch_mean': np.mean(pitch_values) if pitch_values else 0,\n",
" 'pitch_std': np.std(pitch_values) if len(pitch_values) > 1 else 0,\n",
" 'pitch_max': np.max(pitch_values) if pitch_values else 0,\n",
" 'pitch_min': np.min(pitch_values) if pitch_values else 0,\n",
" 'pitch_range': np.ptp(pitch_values) if pitch_values else 0, # 峰峰值\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0\n",
" }\n",
" \n",
" # 提取调谐偏差\n",
" tuning_offset = librosa.estimate_tuning(y=audio, sr=sr)\n",
" pitch_stats['tuning_offset'] = tuning_offset\n",
" \n",
" # 创建统计特征DataFrame\n",
" pitch_stats_df = pd.DataFrame([pitch_stats])\n",
" \n",
" # 展示结果\n",
" print(\"\\n音高帧级数据 (前10行):\")\n",
" display(pitch_frames_df.head(10))\n",
" \n",
" print(\"\\n音高和调谐统计特征:\")\n",
" display(pitch_stats_df)\n",
" \n",
" # 计算音高四分位数\n",
" q1 = np.percentile(pitch_values, 25) if pitch_values else 0\n",
" q3 = np.percentile(pitch_values, 75) if pitch_values else 0\n",
" iqr = q3 - q1\n",
" \n",
" quartile_stats = {\n",
" 'pitch_q1': q1,\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0,\n",
" 'pitch_q3': q3,\n",
" 'pitch_iqr': iqr\n",
" }\n",
" \n",
" print(\"\\n音高四分位数统计:\")\n",
" display(pd.DataFrame([quartile_stats]))\n",
" \n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
" \n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用RAVDESS音频文件进行测试: ./RAVDESS/Actor_01/03-01-01-01-01-01-01.wav\n",
"\n",
"音高帧级数据 (前10行):\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" frame | \n",
" time | \n",
" pitch | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0.00000 | \n",
" 2458.481445 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0.02322 | \n",
" 182.268219 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 0.04644 | \n",
" 190.799026 | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" 0.06966 | \n",
" 197.138947 | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" 0.09288 | \n",
" 423.822876 | \n",
"
\n",
" \n",
" | 5 | \n",
" 5 | \n",
" 0.11610 | \n",
" 166.867813 | \n",
"
\n",
" \n",
" | 6 | \n",
" 6 | \n",
" 0.13932 | \n",
" 172.440811 | \n",
"
\n",
" \n",
" | 7 | \n",
" 7 | \n",
" 0.16254 | \n",
" 168.360764 | \n",
"
\n",
" \n",
" | 8 | \n",
" 8 | \n",
" 0.18576 | \n",
" 148.656494 | \n",
"
\n",
" \n",
" | 9 | \n",
" 9 | \n",
" 0.20898 | \n",
" 151.040848 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" frame time pitch\n",
"0 0 0.00000 2458.481445\n",
"1 1 0.02322 182.268219\n",
"2 2 0.04644 190.799026\n",
"3 3 0.06966 197.138947\n",
"4 4 0.09288 423.822876\n",
"5 5 0.11610 166.867813\n",
"6 6 0.13932 172.440811\n",
"7 7 0.16254 168.360764\n",
"8 8 0.18576 148.656494\n",
"9 9 0.20898 151.040848"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高和调谐统计特征:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_mean | \n",
" pitch_std | \n",
" pitch_max | \n",
" pitch_min | \n",
" pitch_range | \n",
" pitch_median | \n",
" tuning_offset | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 573.830444 | \n",
" 853.130066 | \n",
" 3988.073975 | \n",
" 147.432693 | \n",
" 3840.641357 | \n",
" 234.112061 | \n",
" -0.17 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_mean pitch_std pitch_max pitch_min pitch_range pitch_median \\\n",
"0 573.830444 853.130066 3988.073975 147.432693 3840.641357 234.112061 \n",
"\n",
" tuning_offset \n",
"0 -0.17 "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"音高四分位数统计:\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pitch_q1 | \n",
" pitch_median | \n",
" pitch_q3 | \n",
" pitch_iqr | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 173.241608 | \n",
" 234.112061 | \n",
" 488.627747 | \n",
" 315.386139 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pitch_q1 pitch_median pitch_q3 pitch_iqr\n",
"0 173.241608 234.112061 488.627747 315.386139"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 测试音高和调谐偏差特征提取,以DataFrame形式展示\n",
"try:\n",
" # 加载一个示例音频文件\n",
" ravdess_file = './RAVDESS/Actor_01/03-01-01-01-01-01-01.wav'\n",
" if os.path.exists(ravdess_file):\n",
" print(f\"使用RAVDESS音频文件进行测试: {ravdess_file}\")\n",
" audio, sr = librosa.load(ravdess_file, sr=SAMPLE_RATE, res_type='kaiser_fast')\n",
" \n",
" # 统一音频长度\n",
" if len(audio) < MAX_SAMPLES:\n",
" # 音频太短,用0填充\n",
" padding = MAX_SAMPLES - len(audio)\n",
" audio = np.pad(audio, (0, padding), 'constant')\n",
" else:\n",
" # 音频太长,截断\n",
" audio = audio[:MAX_SAMPLES]\n",
" \n",
" # 创建一个存储音高和调谐特征的DataFrame\n",
" pitch_df = pd.DataFrame()\n",
" \n",
" # 提取音高特征\n",
" pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)\n",
" \n",
" # 提取每帧最大幅度对应的音高\n",
" pitch_values = []\n",
" times_list = []\n",
" frame_indices = []\n",
" \n",
" for t in range(pitches.shape[1]):\n",
" idx = np.argmax(magnitudes[:, t])\n",
" pitch = pitches[idx, t]\n",
" time = t * 512 / sr # 计算时间点\n",
" \n",
" if pitch > 0: # 过滤掉静音帧\n",
" pitch_values.append(pitch)\n",
" times_list.append(time)\n",
" frame_indices.append(t)\n",
" \n",
" # 创建音高帧级DataFrame\n",
" pitch_frames_df = pd.DataFrame({\n",
" 'frame': frame_indices,\n",
" 'time': times_list,\n",
" 'pitch': pitch_values\n",
" })\n",
" \n",
" # 计算音高统计特征\n",
" pitch_stats = {\n",
" 'pitch_mean': np.mean(pitch_values) if pitch_values else 0,\n",
" 'pitch_std': np.std(pitch_values) if len(pitch_values) > 1 else 0,\n",
" 'pitch_max': np.max(pitch_values) if pitch_values else 0,\n",
" 'pitch_min': np.min(pitch_values) if pitch_values else 0,\n",
" 'pitch_range': np.ptp(pitch_values) if pitch_values else 0, # 峰峰值\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0\n",
" }\n",
" \n",
" # 提取调谐偏差\n",
" tuning_offset = librosa.estimate_tuning(y=audio, sr=sr)\n",
" pitch_stats['tuning_offset'] = tuning_offset\n",
" \n",
" # 创建统计特征DataFrame\n",
" pitch_stats_df = pd.DataFrame([pitch_stats])\n",
" \n",
" # 展示结果\n",
" print(\"\\n音高帧级数据 (前10行):\")\n",
" display(pitch_frames_df.head(10))\n",
" \n",
" print(\"\\n音高和调谐统计特征:\")\n",
" display(pitch_stats_df)\n",
" \n",
" # 计算音高四分位数\n",
" q1 = np.percentile(pitch_values, 25) if pitch_values else 0\n",
" q3 = np.percentile(pitch_values, 75) if pitch_values else 0\n",
" iqr = q3 - q1\n",
" \n",
" quartile_stats = {\n",
" 'pitch_q1': q1,\n",
" 'pitch_median': np.median(pitch_values) if pitch_values else 0,\n",
" 'pitch_q3': q3,\n",
" 'pitch_iqr': iqr\n",
" }\n",
" \n",
" print(\"\\n音高四分位数统计:\")\n",
" display(pd.DataFrame([quartile_stats]))\n",
" \n",
" else:\n",
" print(\"RAVDESS音频示例文件不存在\")\n",
" \n",
"except Exception as e:\n",
" print(f\"提取特征时出错: {e}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始使用真实数据训练模型...\n",
"训练参数:\n",
" 轮次: 100\n",
" 批量大小: 32\n",
" 检查点路径: ./output\\emotion_model\\best_model_weights.h5\n",
"Epoch 1/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.1881 - loss: 4.1832 \n",
"Epoch 1: val_loss improved from inf to 3.07267, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 77ms/step - accuracy: 0.1933 - loss: 4.1151 - val_accuracy: 0.2583 - val_loss: 3.0727 - learning_rate: 0.0010\n",
"Epoch 2/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.3231 - loss: 3.1472 \n",
"Epoch 2: val_loss improved from 3.07267 to 2.91173, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.3228 - loss: 3.1418 - val_accuracy: 0.2000 - val_loss: 2.9117 - learning_rate: 0.0010\n",
"Epoch 3/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.3529 - loss: 2.9806 \n",
"Epoch 3: val_loss improved from 2.91173 to 2.81711, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.3504 - loss: 2.9690 - val_accuracy: 0.1667 - val_loss: 2.8171 - learning_rate: 0.0010\n",
"Epoch 4/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.3511 - loss: 2.7555 \n",
"Epoch 4: val_loss improved from 2.81711 to 2.76079, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.3508 - loss: 2.7371 - val_accuracy: 0.1667 - val_loss: 2.7608 - learning_rate: 0.0010\n",
"Epoch 5/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.3762 - loss: 2.5691 \n",
"Epoch 5: val_loss improved from 2.76079 to 2.72165, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.3851 - loss: 2.5434 - val_accuracy: 0.1667 - val_loss: 2.7217 - learning_rate: 0.0010\n",
"Epoch 6/100\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.4150 - loss: 2.5097\n",
"Epoch 6: val_loss improved from 2.72165 to 2.70396, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 24ms/step - accuracy: 0.4151 - loss: 2.5085 - val_accuracy: 0.1583 - val_loss: 2.7040 - learning_rate: 0.0010\n",
"Epoch 7/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.4134 - loss: 2.5004 \n",
"Epoch 7: val_loss improved from 2.70396 to 2.69049, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 22ms/step - accuracy: 0.4237 - loss: 2.4482 - val_accuracy: 0.1750 - val_loss: 2.6905 - learning_rate: 0.0010\n",
"Epoch 8/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5066 - loss: 2.1765 \n",
"Epoch 8: val_loss improved from 2.69049 to 2.67515, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.4988 - loss: 2.2100 - val_accuracy: 0.2333 - val_loss: 2.6752 - learning_rate: 0.0010\n",
"Epoch 9/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.4397 - loss: 2.1924 \n",
"Epoch 9: val_loss improved from 2.67515 to 2.66241, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.4442 - loss: 2.1894 - val_accuracy: 0.2667 - val_loss: 2.6624 - learning_rate: 0.0010\n",
"Epoch 10/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.4683 - loss: 2.1495 \n",
"Epoch 10: val_loss improved from 2.66241 to 2.65176, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.4700 - loss: 2.1500 - val_accuracy: 0.2833 - val_loss: 2.6518 - learning_rate: 0.0010\n",
"Epoch 11/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.4583 - loss: 2.0871 \n",
"Epoch 11: val_loss improved from 2.65176 to 2.63942, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.4561 - loss: 2.1086 - val_accuracy: 0.2833 - val_loss: 2.6394 - learning_rate: 0.0010\n",
"Epoch 12/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.4799 - loss: 2.1424\n",
"Epoch 12: val_loss improved from 2.63942 to 2.62882, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 22ms/step - accuracy: 0.4839 - loss: 2.1406 - val_accuracy: 0.2750 - val_loss: 2.6288 - learning_rate: 0.0010\n",
"Epoch 13/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.5356 - loss: 2.1000\n",
"Epoch 13: val_loss improved from 2.62882 to 2.61788, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.5361 - loss: 2.0992 - val_accuracy: 0.2750 - val_loss: 2.6179 - learning_rate: 0.0010\n",
"Epoch 14/100\n",
"\u001b[1m 6/12\u001b[0m \u001b[32m━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.4660 - loss: 2.0251\n",
"Epoch 14: val_loss improved from 2.61788 to 2.60245, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 25ms/step - accuracy: 0.4793 - loss: 2.0527 - val_accuracy: 0.2833 - val_loss: 2.6024 - learning_rate: 0.0010\n",
"Epoch 15/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 12ms/step - accuracy: 0.5078 - loss: 2.0108\n",
"Epoch 15: val_loss improved from 2.60245 to 2.59021, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 25ms/step - accuracy: 0.5108 - loss: 2.0079 - val_accuracy: 0.2750 - val_loss: 2.5902 - learning_rate: 0.0010\n",
"Epoch 16/100\n",
"\u001b[1m 8/12\u001b[0m \u001b[32m━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 8ms/step - accuracy: 0.5107 - loss: 1.9981 \n",
"Epoch 16: val_loss improved from 2.59021 to 2.57797, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.5153 - loss: 1.9897 - val_accuracy: 0.2833 - val_loss: 2.5780 - learning_rate: 0.0010\n",
"Epoch 17/100\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.5400 - loss: 1.9434\n",
"Epoch 17: val_loss improved from 2.57797 to 2.56796, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.5404 - loss: 1.9445 - val_accuracy: 0.2750 - val_loss: 2.5680 - learning_rate: 0.0010\n",
"Epoch 18/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5527 - loss: 1.9138 \n",
"Epoch 18: val_loss improved from 2.56796 to 2.55026, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.5532 - loss: 1.9246 - val_accuracy: 0.3000 - val_loss: 2.5503 - learning_rate: 0.0010\n",
"Epoch 19/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5570 - loss: 1.9163 \n",
"Epoch 19: val_loss improved from 2.55026 to 2.52697, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.5544 - loss: 1.9199 - val_accuracy: 0.3000 - val_loss: 2.5270 - learning_rate: 0.0010\n",
"Epoch 20/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.5514 - loss: 1.9147\n",
"Epoch 20: val_loss improved from 2.52697 to 2.49535, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 22ms/step - accuracy: 0.5542 - loss: 1.9113 - val_accuracy: 0.3333 - val_loss: 2.4954 - learning_rate: 0.0010\n",
"Epoch 21/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.5543 - loss: 1.9019 \n",
"Epoch 21: val_loss improved from 2.49535 to 2.46785, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.5600 - loss: 1.8988 - val_accuracy: 0.3333 - val_loss: 2.4678 - learning_rate: 0.0010\n",
"Epoch 22/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.5654 - loss: 1.9222\n",
"Epoch 22: val_loss improved from 2.46785 to 2.43942, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 28ms/step - accuracy: 0.5681 - loss: 1.9149 - val_accuracy: 0.3417 - val_loss: 2.4394 - learning_rate: 0.0010\n",
"Epoch 23/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.6012 - loss: 1.8228 \n",
"Epoch 23: val_loss improved from 2.43942 to 2.41976, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 17ms/step - accuracy: 0.6049 - loss: 1.8204 - val_accuracy: 0.3667 - val_loss: 2.4198 - learning_rate: 0.0010\n",
"Epoch 24/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.5819 - loss: 1.8635 \n",
"Epoch 24: val_loss improved from 2.41976 to 2.38653, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 18ms/step - accuracy: 0.5889 - loss: 1.8442 - val_accuracy: 0.3500 - val_loss: 2.3865 - learning_rate: 0.0010\n",
"Epoch 25/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.5965 - loss: 1.8145 \n",
"Epoch 25: val_loss improved from 2.38653 to 2.34923, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 18ms/step - accuracy: 0.5983 - loss: 1.8172 - val_accuracy: 0.3583 - val_loss: 2.3492 - learning_rate: 0.0010\n",
"Epoch 26/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.6010 - loss: 1.7406 \n",
"Epoch 26: val_loss improved from 2.34923 to 2.33124, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.5984 - loss: 1.7382 - val_accuracy: 0.3667 - val_loss: 2.3312 - learning_rate: 0.0010\n",
"Epoch 27/100\n",
"\u001b[1m 6/12\u001b[0m \u001b[32m━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.6005 - loss: 1.7222\n",
"Epoch 27: val_loss improved from 2.33124 to 2.30899, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.6159 - loss: 1.7180 - val_accuracy: 0.3583 - val_loss: 2.3090 - learning_rate: 0.0010\n",
"Epoch 28/100\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.6474 - loss: 1.6815\n",
"Epoch 28: val_loss improved from 2.30899 to 2.27995, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 22ms/step - accuracy: 0.6476 - loss: 1.6803 - val_accuracy: 0.3667 - val_loss: 2.2800 - learning_rate: 0.0010\n",
"Epoch 29/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 11ms/step - accuracy: 0.6280 - loss: 1.7451\n",
"Epoch 29: val_loss improved from 2.27995 to 2.27024, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.6306 - loss: 1.7372 - val_accuracy: 0.3500 - val_loss: 2.2702 - learning_rate: 0.0010\n",
"Epoch 30/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.6910 - loss: 1.6493 \n",
"Epoch 30: val_loss improved from 2.27024 to 2.24994, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.6879 - loss: 1.6493 - val_accuracy: 0.3833 - val_loss: 2.2499 - learning_rate: 0.0010\n",
"Epoch 31/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.7136 - loss: 1.5697 \n",
"Epoch 31: val_loss improved from 2.24994 to 2.24836, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.7054 - loss: 1.5845 - val_accuracy: 0.3833 - val_loss: 2.2484 - learning_rate: 0.0010\n",
"Epoch 32/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.6437 - loss: 1.6553 \n",
"Epoch 32: val_loss improved from 2.24836 to 2.22598, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 19ms/step - accuracy: 0.6458 - loss: 1.6541 - val_accuracy: 0.3917 - val_loss: 2.2260 - learning_rate: 0.0010\n",
"Epoch 33/100\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.6558 - loss: 1.6250\n",
"Epoch 33: val_loss improved from 2.22598 to 2.19080, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 21ms/step - accuracy: 0.6554 - loss: 1.6239 - val_accuracy: 0.4083 - val_loss: 2.1908 - learning_rate: 0.0010\n",
"Epoch 34/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.6648 - loss: 1.5204 \n",
"Epoch 34: val_loss improved from 2.19080 to 2.16207, saving model to ./output\\emotion_model\\best_model_weights.h5\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 18ms/step - accuracy: 0.6683 - loss: 1.5228 - val_accuracy: 0.4167 - val_loss: 2.1621 - learning_rate: 0.0010\n",
"Epoch 35/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - accuracy: 0.6689 - loss: 1.5494\n",
"Epoch 35: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 22ms/step - accuracy: 0.6724 - loss: 1.5530 - val_accuracy: 0.4083 - val_loss: 2.1839 - learning_rate: 0.0010\n",
"Epoch 36/100\n",
"\u001b[1m11/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━\u001b[0m \u001b[1m0s\u001b[0m 6ms/step - accuracy: 0.6951 - loss: 1.5312 \n",
"Epoch 36: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 13ms/step - accuracy: 0.6937 - loss: 1.5345 - val_accuracy: 0.4083 - val_loss: 2.1734 - learning_rate: 0.0010\n",
"Epoch 37/100\n",
"\u001b[1m 7/12\u001b[0m \u001b[32m━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 9ms/step - accuracy: 0.6908 - loss: 1.5607 \n",
"Epoch 37: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 15ms/step - accuracy: 0.7032 - loss: 1.5439 - val_accuracy: 0.3917 - val_loss: 2.2004 - learning_rate: 0.0010\n",
"Epoch 38/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.7480 - loss: 1.4373 \n",
"Epoch 38: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - accuracy: 0.7359 - loss: 1.4556 - val_accuracy: 0.3917 - val_loss: 2.1724 - learning_rate: 0.0010\n",
"Epoch 39/100\n",
"\u001b[1m 9/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.7369 - loss: 1.4801 \n",
"Epoch 39: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 14ms/step - accuracy: 0.7298 - loss: 1.4826 - val_accuracy: 0.3750 - val_loss: 2.1938 - learning_rate: 0.0010\n",
"Epoch 40/100\n",
"\u001b[1m 8/12\u001b[0m \u001b[32m━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 7ms/step - accuracy: 0.7517 - loss: 1.4243 \n",
"Epoch 40: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 15ms/step - accuracy: 0.7413 - loss: 1.4394 - val_accuracy: 0.4083 - val_loss: 2.1900 - learning_rate: 2.0000e-04\n",
"Epoch 41/100\n",
"\u001b[1m 8/12\u001b[0m \u001b[32m━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 8ms/step - accuracy: 0.6834 - loss: 1.4966 \n",
"Epoch 41: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 15ms/step - accuracy: 0.6923 - loss: 1.4886 - val_accuracy: 0.4167 - val_loss: 2.1851 - learning_rate: 2.0000e-04\n",
"Epoch 42/100\n",
"\u001b[1m10/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━\u001b[0m \u001b[1m0s\u001b[0m 12ms/step - accuracy: 0.7367 - loss: 1.4154\n",
"Epoch 42: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 20ms/step - accuracy: 0.7355 - loss: 1.4192 - val_accuracy: 0.4417 - val_loss: 2.1892 - learning_rate: 2.0000e-04\n",
"Epoch 43/100\n",
"\u001b[1m 8/12\u001b[0m \u001b[32m━━━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 8ms/step - accuracy: 0.7401 - loss: 1.4367 \n",
"Epoch 43: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 16ms/step - accuracy: 0.7391 - loss: 1.4420 - val_accuracy: 0.4333 - val_loss: 2.1974 - learning_rate: 2.0000e-04\n",
"Epoch 44/100\n",
"\u001b[1m 7/12\u001b[0m \u001b[32m━━━━━━━━━━━\u001b[0m\u001b[37m━━━━━━━━━\u001b[0m \u001b[1m0s\u001b[0m 10ms/step - accuracy: 0.7057 - loss: 1.4840\n",
"Epoch 44: val_loss did not improve from 2.16207\n",
"\u001b[1m12/12\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 17ms/step - accuracy: 0.7197 - loss: 1.4650 - val_accuracy: 0.4167 - val_loss: 2.2100 - learning_rate: 2.0000e-04\n",
"\n",
"训练完成! 耗时: 18.03秒\n",
"\n",
"在测试集上评估模型:\n",
"\u001b[1m4/4\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 9ms/step - accuracy: 0.4150 - loss: 2.1795 \n",
"测试损失: 2.1404\n",
"测试准确率: 0.4333\n",
"\n",
"在测试集上预测:\n",
"\u001b[1m4/4\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 7ms/step \n",
"测试集中实际出现的类别索引: [0 1 2 3 4 5]\n",
"测试集中实际出现的类别: [np.str_('angry'), np.str_('fear'), np.str_('happy'), np.str_('neutral'), np.str_('sad'), np.str_('surprise')]\n",
"准确率: 0.4333\n",
"\n",
"分类报告:\n",
" angry: 精确率=0.7692, 召回率=0.5000, F1分数=0.6061\n",
" fear: 精确率=0.7273, 召回率=0.4000, F1分数=0.5161\n",
" happy: 精确率=0.4000, 召回率=0.4000, F1分数=0.4000\n",
" neutral: 精确率=0.2917, 召回率=0.3500, F1分数=0.3182\n",
" sad: 精确率=0.4444, 召回率=0.4000, F1分数=0.4211\n",
" surprise: 精确率=0.3235, 召回率=0.5500, F1分数=0.4074\n",
"\n",
"平均指标:\n",
"宏平均: 精确率=0.4927, 召回率=0.4333, F1分数=0.4448\n",
"加权平均: 精确率=0.4927, 召回率=0.4333, F1分数=0.4448\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"详细分类报告:\n",
" precision recall f1-score support\n",
"\n",
" angry 0.77 0.50 0.61 20\n",
" fear 0.73 0.40 0.52 20\n",
" happy 0.40 0.40 0.40 20\n",
" neutral 0.29 0.35 0.32 20\n",
" sad 0.44 0.40 0.42 20\n",
" surprise 0.32 0.55 0.41 20\n",
"\n",
" accuracy 0.43 120\n",
" macro avg 0.49 0.43 0.44 120\n",
"weighted avg 0.49 0.43 0.44 120\n",
"\n",
"\n",
"保存模型和相关信息...\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"模型训练和评估完成!所有文件已保存到 './output\\emotion_model' 目录\n"
]
}
],
"source": [
"# 模型训练\n",
"try:\n",
" if 'emotion_model' in locals() and 'X_train_reshaped' in locals() and 'y_train_categorical' in locals():\n",
" # 设置训练参数\n",
" print(\"开始使用真实数据训练模型...\")\n",
" \n",
" # 设置输出目录和检查点路径\n",
" output_dir = './output'\n",
" model_name = 'emotion_model'\n",
" os.makedirs(output_dir, exist_ok=True)\n",
" model_dir = os.path.join(output_dir, model_name)\n",
" os.makedirs(model_dir, exist_ok=True)\n",
" \n",
" checkpoint_path = os.path.join(model_dir, 'best_model_weights.h5')\n",
" \n",
" # 设置训练参数\n",
" epochs = 100 # 与默认值一致\n",
" batch_size = 32 # 与默认值一致\n",
" \n",
" print(f\"训练参数:\")\n",
" print(f\" 轮次: {epochs}\")\n",
" print(f\" 批量大小: {batch_size}\")\n",
" print(f\" 检查点路径: {checkpoint_path}\")\n",
" \n",
" # 记录训练开始时间\n",
" import time\n",
" start_time = time.time()\n",
" \n",
" # 训练模型\n",
" history = emotion_model.train(\n",
" X_train=X_train_reshaped,\n",
" y_train=y_train_categorical,\n",
" X_val=X_val_reshaped,\n",
" y_val=y_val_categorical,\n",
" epochs=epochs,\n",
" batch_size=batch_size,\n",
" checkpoint_path=checkpoint_path\n",
" )\n",
" \n",
" # 计算训练时间\n",
" training_time = time.time() - start_time\n",
" print(f\"\\n训练完成! 耗时: {training_time:.2f}秒\")\n",
" \n",
" # 绘制训练历史\n",
" \n",
" # 在测试集上评估模型\n",
" print(\"\\n在测试集上评估模型:\")\n",
" test_loss, test_acc = emotion_model.evaluate(X_test_reshaped, y_test_categorical)\n",
" print(f\"测试损失: {test_loss:.4f}\")\n",
" print(f\"测试准确率: {test_acc:.4f}\")\n",
" \n",
" # 在测试集上预测\n",
" print(\"\\n在测试集上预测:\")\n",
" y_pred = emotion_model.predict(X_test_reshaped)\n",
" \n",
" # 将独热编码转换回类别\n",
" y_true_classes = np.argmax(y_test_categorical, axis=1)\n",
" y_pred_classes = np.argmax(y_pred, axis=1)\n",
" \n",
" # 保存值和预测值用于后续分析\n",
" np.save(os.path.join(model_dir, 'y_true.npy'), y_true_classes)\n",
" np.save(os.path.join(model_dir, 'y_pred.npy'), y_pred_classes)\n",
" \n",
" # 计算混淆矩阵\n",
" from sklearn.metrics import confusion_matrix, classification_report\n",
" \n",
" # 获取情感类别名称\n",
" emotion_classes = encoder.classes_\n",
" \n",
" # 创建评估函数\n",
"        def evaluate_model(y_true, y_pred, class_names):\n",
"            \"\"\"\n",
"            Score the model's predictions against the ground-truth labels.\n",
"\n",
"            Args:\n",
"                y_true: Ground-truth labels, one row per sample; the class index\n",
"                    is taken as the argmax along axis 1.\n",
"                y_pred: Predicted scores, one row per sample; the class index is\n",
"                    taken as the argmax along axis 1.\n",
"                class_names: Sequence mapping a class index to its name.\n",
"\n",
"            Returns:\n",
"                dict with keys 'confusion_matrix', 'accuracy', 'report'\n",
"                (classification_report as a dict), 'present_class_indices'\n",
"                and 'present_class_names'.\n",
"            \"\"\"\n",
"            true_idx = np.argmax(y_true, axis=1)\n",
"            pred_idx = np.argmax(y_pred, axis=1)\n",
"\n",
"            # Restrict every metric to classes seen in the labels or the predictions\n",
"            seen_indices = np.unique(np.concatenate([true_idx, pred_idx]))\n",
"            print(f'测试集中实际出现的类别索引: {seen_indices}')\n",
"            seen_names = [class_names[k] for k in seen_indices]\n",
"            print(f'测试集中实际出现的类别: {seen_names}')\n",
"\n",
"            summary = {}\n",
"            summary['confusion_matrix'] = confusion_matrix(\n",
"                true_idx, pred_idx, labels=seen_indices)\n",
"            # Fraction of samples whose predicted index equals the true index\n",
"            summary['accuracy'] = np.sum(true_idx == pred_idx) / len(true_idx)\n",
"            summary['report'] = classification_report(\n",
"                true_idx, pred_idx,\n",
"                labels=seen_indices,\n",
"                target_names=seen_names,\n",
"                output_dict=True)\n",
"            summary['present_class_indices'] = seen_indices\n",
"            summary['present_class_names'] = seen_names\n",
"            return summary\n",
" \n",
" # 打印评估结果\n",
"        def print_evaluation_results(results):\n",
"            \"\"\"\n",
"            Print the accuracy, a warning for unseen classes, and the metric table.\n",
"\n",
"            Reads `emotion_classes` from the enclosing cell to decide which\n",
"            classes are missing.\n",
"\n",
"            Args:\n",
"                results: Dict holding 'accuracy', 'present_class_indices' and\n",
"                    'report' (a classification_report dict).\n",
"            \"\"\"\n",
"            def metric_text(entry):\n",
"                # Shared \"precision, recall, F1\" fragment used by every printed row\n",
"                return (f\"精确率={entry['precision']:.4f}, \"\n",
"                        f\"召回率={entry['recall']:.4f}, \"\n",
"                        f\"F1分数={entry['f1-score']:.4f}\")\n",
"\n",
"            print(f\"准确率: {results['accuracy']:.4f}\")\n",
"\n",
"            # Class indices that appear in neither the labels nor the predictions\n",
"            all_class_indices = set(range(len(emotion_classes)))\n",
"            missing_indices = all_class_indices - set(results['present_class_indices'])\n",
"            if missing_indices:\n",
"                missing_classes = [emotion_classes[i] for i in missing_indices]\n",
"                print(f'\\n警告: 测试集中缺少以下类别: {missing_classes}')\n",
"                print('这可能是由于数据集划分不均衡或数据集太小所致')\n",
"\n",
"            print('\\n分类报告:')\n",
"            report = results['report']\n",
"            summary_rows = ('accuracy', 'macro avg', 'weighted avg')\n",
"\n",
"            # Per-class rows in sorted order; the aggregate rows are printed afterwards\n",
"            for class_name in sorted(key for key in report if key not in summary_rows):\n",
"                print(f'{class_name:>10}: {metric_text(report[class_name])}')\n",
"\n",
"            print('\\n平均指标:')\n",
"            print(f\"宏平均: {metric_text(report['macro avg'])}\")\n",
"            print(f\"加权平均: {metric_text(report['weighted avg'])}\")\n",
" \n",
" # 计算评估指标\n",
" results = evaluate_model(y_test_categorical, y_pred, emotion_classes)\n",
" print_evaluation_results(results)\n",
" \n",
" # 绘制混淆矩阵\n",
"        def plot_confusion_matrix(y_true, y_pred, class_names, save_path=None, normalize=False):\n",
"            \"\"\"\n",
"            Plot (and optionally save) a confusion matrix aligned with class_names.\n",
"\n",
"            Args:\n",
"                y_true: Ground-truth class indices.\n",
"                y_pred: Predicted class indices.\n",
"                class_names: Class names; index i labels row/column i.\n",
"                save_path: If given, the figure is written to this path.\n",
"                normalize: If True, each row is divided by its row sum.\n",
"            \"\"\"\n",
"            # Pin the label set so the matrix is always len(class_names) square and\n",
"            # row/column i lines up with class_names[i], even when a class is absent\n",
"            # from both y_true and y_pred.\n",
"            cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))\n",
"\n",
"            if normalize:\n",
"                row_totals = cm.sum(axis=1, keepdims=True)\n",
"                # Divide only where the row has samples: a class with no true samples\n",
"                # would otherwise give 0/0 = NaN cells\n",
"                cm = np.divide(cm, row_totals,\n",
"                               out=np.zeros(cm.shape, dtype=float),\n",
"                               where=row_totals != 0)\n",
"                cm = np.round(cm, 2)\n",
"\n",
"            plt.figure(figsize=(10, 8))\n",
"            plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
"            plt.title('混淆矩阵')\n",
"            plt.colorbar()\n",
"            tick_marks = np.arange(len(class_names))\n",
"            plt.xticks(tick_marks, class_names, rotation=45)\n",
"            plt.yticks(tick_marks, class_names)\n",
"\n",
"            # Write each cell's value; white text on dark cells, black on light ones\n",
"            thresh = cm.max() / 2.\n",
"            for i in range(cm.shape[0]):\n",
"                for j in range(cm.shape[1]):\n",
"                    plt.text(j, i, cm[i, j],\n",
"                             horizontalalignment='center',\n",
"                             color='white' if cm[i, j] > thresh else 'black')\n",
"\n",
"            # Set the axis labels before tight_layout so they are part of the layout\n",
"            plt.ylabel('真实标签')\n",
"            plt.xlabel('预测标签')\n",
"            plt.tight_layout()\n",
"\n",
"            if save_path:\n",
"                plt.savefig(save_path)\n",
"\n",
"            plt.show()\n",
" \n",
"        # Plot the row-normalized confusion matrix and save it next to the model.\n",
"        # y_true_classes / y_pred_classes were already derived from the one-hot\n",
"        # labels and the predictions earlier in this cell, so they are reused here.\n",
"        cm_plot_path = os.path.join(model_dir, 'confusion_matrix.png')\n",
"        plot_confusion_matrix(y_true_classes, y_pred_classes, emotion_classes,\n",
"                              save_path=cm_plot_path, normalize=True)\n",
"\n",
"        # Also print scikit-learn's plain-text report for reference.\n",
"        # `labels` is pinned to every known class: without it classification_report\n",
"        # raises ValueError whenever a class is absent from the test split, because\n",
"        # the number of observed classes no longer matches len(target_names).\n",
"        print('\\n详细分类报告:')\n",
"        print(classification_report(\n",
"            y_true_classes, y_pred_classes,\n",
"            labels=np.arange(len(emotion_classes)),\n",
"            target_names=emotion_classes,\n",
"            zero_division=0))\n",
" \n",
" # 保存模型和相关信息\n",
" print(\"\\n保存模型和相关信息...\")\n",
" \n",
" # 保存完整模型\n",
" emotion_model.save(os.path.join(model_dir, 'emotion_model.h5'))\n",
" \n",
" # 保存标签编码器\n",
" with open(os.path.join(model_dir, 'emotion_encoder.pkl'), 'wb') as f:\n",
" pickle.dump(encoder, f)\n",
" \n",
" # 保存特征缩放器\n",
" with open(os.path.join(model_dir, 'feature_scaler.pkl'), 'wb') as f:\n",
" pickle.dump(scaler, f)\n",
" \n",
" # 保存特征名称\n",
" with open(os.path.join(model_dir, 'feature_names.pkl'), 'wb') as f:\n",
" pickle.dump(feature_names, f)\n",
" \n",
" # 绘制训练历史曲线\n",
"        def plot_training_history(history, save_path=None):\n",
"            \"\"\"\n",
"            Draw training vs. validation accuracy and loss side by side.\n",
"\n",
"            Args:\n",
"                history: Training history object; its `history` dict is read for\n",
"                    'accuracy', 'val_accuracy', 'loss' and 'val_loss'.\n",
"                save_path: If given, the figure is written to this path.\n",
"            \"\"\"\n",
"            plt.figure(figsize=(12, 4))\n",
"\n",
"            # One entry per panel:\n",
"            # (metric key, train label, validation label, title, y label, legend position)\n",
"            panels = [\n",
"                ('accuracy', '训练准确率', '验证准确率', '模型准确率', '准确率', 'lower right'),\n",
"                ('loss', '训练损失', '验证损失', '模型损失', '损失', 'upper right'),\n",
"            ]\n",
"            for position, panel in enumerate(panels, start=1):\n",
"                metric, train_label, val_label, title, y_label, legend_loc = panel\n",
"                plt.subplot(1, 2, position)\n",
"                plt.plot(history.history[metric], label=train_label)\n",
"                plt.plot(history.history['val_' + metric], label=val_label)\n",
"                plt.title(title)\n",
"                plt.xlabel('轮次')\n",
"                plt.ylabel(y_label)\n",
"                plt.legend(loc=legend_loc)\n",
"                plt.grid(True)\n",
"\n",
"            plt.tight_layout()\n",
"\n",
"            if save_path:\n",
"                plt.savefig(save_path)\n",
"\n",
"            plt.show()\n",
" \n",
" # 绘制训练历史\n",
" history_plot_path = os.path.join(model_dir, 'training_history.png')\n",
" plot_training_history(history, save_path=history_plot_path)\n",
" \n",
" # 保存训练配置\n",
" config = {\n",
" 'sample_rate': SAMPLE_RATE,\n",
" 'max_duration': MAX_DURATION,\n",
" 'max_samples': MAX_SAMPLES,\n",
" 'input_shape': (time_steps, features),\n",
" 'num_emotions': num_emotions,\n",
" 'lstm_units': lstm_units,\n",
" 'dropout_rate': dropout_rate,\n",
" 'regularization_rate': regularization_rate,\n",
" 'learning_rate': learning_rate,\n",
" 'batch_size': batch_size,\n",
" 'epochs': epochs,\n",
" 'training_time': training_time,\n",
" 'accuracy': float(results['accuracy']),\n",
" 'classes': list(encoder.classes_),\n",
" 'num_features': len(feature_names)\n",
" }\n",
" \n",
" with open(os.path.join(model_dir, 'config.pkl'), 'wb') as f:\n",
" pickle.dump(config, f)\n",
" \n",
" print(f\"模型训练和评估完成!所有文件已保存到 '{model_dir}' 目录\")\n",
" \n",
" else:\n",
" print(\"模型或数据不可用,无法训练模型\")\n",
"except Exception as e:\n",
" print(f\"训练模型时出错: {e}\")\n",
" import traceback\n",
" traceback.print_exc()\n"
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": [
"## 3.3 模型预测测试\n",
"\n",
"我们将使用训练好的模型对新的音频样本进行情感预测,并可视化结果。\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"使用当前会话中的模型进行预测...\n",
"使用测试文件: ./RAVDESS/Actor_12\\03-01-01-01-01-01-12.wav\n",
"预测时出错: name 'extract_mfcc_features' is not defined\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"C:\\Users\\lenovo\\AppData\\Local\\Temp\\ipykernel_11496\\755389603.py\", line 122, in \n",
" predicted_emotion, emotion_probs = predict_emotion(\n",
" ^^^^^^^^^^^^^^^^\n",
" File \"C:\\Users\\lenovo\\AppData\\Local\\Temp\\ipykernel_11496\\755389603.py\", line 31, in predict_emotion\n",
" features = extract_all_features(audio, sr)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"C:\\Users\\lenovo\\AppData\\Local\\Temp\\ipykernel_11496\\3740659840.py\", line 31, in extract_all_features\n",
" mfcc_features = extract_mfcc_features(audio, sr)\n",
" ^^^^^^^^^^^^^^^^^^^^^\n",
"NameError: name 'extract_mfcc_features' is not defined. Did you mean: 'extract_pitch_features'?\n"
]
}
],
"source": [
"# 使用训练好的模型进行预测\n",
"def predict_emotion(audio_file, model, encoder, scaler, feature_names, sr=SAMPLE_RATE):\n",
"    \"\"\"\n",
"    Predict the emotion of one audio file with a trained model.\n",
"\n",
"    The clip is loaded, padded or truncated to MAX_SAMPLES, passed through\n",
"    extract_all_features, arranged in feature_names order, scaled, and fed to\n",
"    the model as a single sample.\n",
"\n",
"    Args:\n",
"        audio_file: Path of the audio file.\n",
"        model: Trained model; its predict() is called on the prepared input.\n",
"        encoder: Fitted label encoder; inverse_transform maps class index to name.\n",
"        scaler: Fitted feature scaler; transform() is applied to the feature row.\n",
"        feature_names: Feature names, in the column order used for the feature row.\n",
"        sr: Target sample rate passed to librosa.load.\n",
"\n",
"    Returns:\n",
"        emotion: Name of the class with the highest predicted score.\n",
"        probabilities: Dict mapping every class name to its predicted score.\n",
"    \"\"\"\n",
"    audio, sr = librosa.load(audio_file, sr=sr, res_type='kaiser_fast')\n",
"\n",
"    # Force a fixed length: zero-pad short clips, truncate long ones\n",
"    if len(audio) < MAX_SAMPLES:\n",
"        audio = np.pad(audio, (0, MAX_SAMPLES - len(audio)), 'constant')\n",
"    else:\n",
"        audio = audio[:MAX_SAMPLES]\n",
"\n",
"    features = extract_all_features(audio, sr)\n",
"\n",
"    # Features the extractor did not return are left at 0, but report them: a\n",
"    # silent zero-fill hides a mismatch between the expected and extracted features.\n",
"    missing = [name for name in feature_names if name not in features]\n",
"    if missing:\n",
"        print(f'警告: 有 {len(missing)} 个特征未提取到, 已用0填充: {missing[:5]}')\n",
"\n",
"    X = np.zeros((1, len(feature_names)))\n",
"    for i, name in enumerate(feature_names):\n",
"        if name in features:\n",
"            X[0, i] = features[name]\n",
"\n",
"    X_norm = scaler.transform(X)\n",
"\n",
"    # Reshape to (1, 1, n_features), the LSTM input format used by this notebook\n",
"    X_reshaped = X_norm.reshape(1, 1, X_norm.shape[1])\n",
"\n",
"    predictions = model.predict(X_reshaped)[0]\n",
"\n",
"    # Decode every class index with one inverse_transform call instead of one per class\n",
"    class_labels = encoder.inverse_transform(np.arange(len(predictions)))\n",
"    emotion = class_labels[np.argmax(predictions)]\n",
"    emotion_probs = dict(zip(class_labels, predictions))\n",
"\n",
"    return emotion, emotion_probs\n",
"\n",
"# 测试预测功能\n",
"try:\n",
" # 检查模型是否已训练\n",
" if 'emotion_model' in locals() and hasattr(emotion_model, 'model'):\n",
" print(\"使用当前会话中的模型进行预测...\")\n",
" \n",
"        # Pick one test clip: the first .wav (in sorted order) from the first\n",
"        # candidate directory that contains one. Order: RAVDESS, SAVEE, CASIA.\n",
"        candidate_dirs = [\n",
"            './RAVDESS/Actor_12',\n",
"            './SAVEE/AudioData/DC',\n",
"            './CAISA/liuchanhg/angry',\n",
"        ]\n",
"        test_file = None\n",
"        for candidate_dir in candidate_dirs:\n",
"            # Skip datasets that are not on disk; an unguarded os.listdir would raise\n",
"            # here and the remaining fallbacks would never be tried\n",
"            if not os.path.isdir(candidate_dir):\n",
"                continue\n",
"            # Sort the listing so the chosen file does not depend on directory order\n",
"            wav_files = sorted(f for f in os.listdir(candidate_dir) if f.endswith('.wav'))\n",
"            if wav_files:\n",
"                test_file = os.path.join(candidate_dir, wav_files[0])\n",
"                break\n",
" \n",
" if test_file:\n",
" print(f\"使用测试文件: {test_file}\")\n",
" \n",
"            # Infer the ground-truth emotion from the file path (best effort);\n",
"            # true_emotion stays None when no known emotion code can be derived.\n",
"            true_emotion = None\n",
"            emotion_code = None\n",
"            file_name = os.path.basename(test_file)\n",
"            if 'RAVDESS' in test_file:\n",
"                # RAVDESS file names look like 03-01-05-01-02-01-12.wav;\n",
"                # the third dash-separated field is used as the emotion code\n",
"                parts = file_name.split('-')\n",
"                if len(parts) >= 3:\n",
"                    emotion_code = parts[2]\n",
"            elif 'SAVEE' in test_file:\n",
"                # SAVEE file names look like a01.wav / sa01.wav / su01.wav: the code is\n",
"                # the two-letter prefix for 'sa' and 'su', otherwise the first letter\n",
"                if file_name.startswith(('sa', 'su')):\n",
"                    emotion_code = file_name[:2]\n",
"                else:\n",
"                    emotion_code = file_name[:1]\n",
"            elif 'CAISA' in test_file:\n",
"                # CASIA layout: ./CAISA/<speaker>/<emotion>/<file>.wav. Take the parent\n",
"                # directory name via os.path instead of split(os.sep): the path mixes\n",
"                # '/' with os.sep on Windows, so splitting on os.sep alone never\n",
"                # isolates the emotion folder there.\n",
"                emotion_code = os.path.basename(os.path.dirname(test_file))\n",
"\n",
"            if emotion_code is not None and emotion_code in EMOTION_MAPPING:\n",
"                true_emotion = EMOTION_MAPPING[emotion_code]\n",
" \n",
" # 预测情感\n",
" predicted_emotion, emotion_probs = predict_emotion(\n",
" test_file, \n",
" emotion_model.model, \n",
" encoder, \n",
" scaler, \n",
" feature_names\n",
" )\n",
" \n",
" print(f\"\\n预测情感: {predicted_emotion}\")\n",
" if true_emotion:\n",
" print(f\"真实情感: {true_emotion}\")\n",
" print(f\"预测正确: {predicted_emotion == true_emotion}\")\n",
" \n",
" # 显示各情感概率\n",
" print(\"\\n各情感概率:\")\n",
" for emotion, prob in sorted(emotion_probs.items(), key=lambda x: x[1], reverse=True):\n",
" print(f\" {emotion}: {prob:.4f}\")\n",
" \n",
" # 可视化音频和预测结果\n",
" plt.figure(figsize=(15, 10))\n",
" \n",
" # 加载音频用于可视化\n",
" audio, sr = librosa.load(test_file, sr=SAMPLE_RATE)\n",
" \n",
" # 绘制波形\n",
" plt.subplot(3, 1, 1)\n",
" librosa.display.waveshow(audio, sr=sr)\n",
" plt.title(f'音频波形 - 预测情感: {predicted_emotion}' + \n",
" (f' (真实: {true_emotion})' if true_emotion else ''))\n",
" plt.xlabel('时间 (秒)')\n",
" plt.ylabel('振幅')\n",
" \n",
" # 绘制声谱图\n",
" plt.subplot(3, 1, 2)\n",
" D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)\n",
" librosa.display.specshow(D, y_axis='log', x_axis='time', sr=sr)\n",
" plt.title('声谱图')\n",
" plt.colorbar(format='%+2.0f dB')\n",
" \n",
" # 绘制情感概率条形图\n",
" plt.subplot(3, 1, 3)\n",
" emotions = list(emotion_probs.keys())\n",
" probs = list(emotion_probs.values())\n",
" \n",
" # 按概率排序\n",
" sorted_indices = np.argsort(probs)[::-1]\n",
" emotions = [emotions[i] for i in sorted_indices]\n",
" probs = [probs[i] for i in sorted_indices]\n",
" \n",
" bars = plt.bar(emotions, probs, color='skyblue')\n",
" \n",
" # 如果有真实情感,高亮显示\n",
" if true_emotion and true_emotion in emotions:\n",
" true_idx = emotions.index(true_emotion)\n",
" bars[true_idx].set_color('green')\n",
" \n",
" # 高亮显示预测情感\n",
" pred_idx = emotions.index(predicted_emotion)\n",
" bars[pred_idx].set_color('red')\n",
" \n",
" plt.title('情感预测概率')\n",
" plt.xlabel('情感类别')\n",
" plt.ylabel('概率')\n",
" plt.ylim(0, 1)\n",
" \n",
" # 添加数值标签\n",
" for i, v in enumerate(probs):\n",
" plt.text(i, v + 0.02, f'{v:.2f}', ha='center')\n",
" \n",
" plt.tight_layout()\n",
" plt.show()\n",
" else:\n",
" print(\"未找到测试音频文件\")\n",
" \n",
" # 如果当前会话中没有模型,尝试加载保存的模型\n",
" elif os.path.exists('./output/emotion_model/emotion_model.h5'):\n",
" print(\"从保存的文件加载模型...\")\n",
" \n",
" # 加载模型\n",
" from tensorflow.keras.models import load_model\n",
" loaded_model = load_model('./output/emotion_model/emotion_model.h5')\n",
" \n",
" # 加载标签编码器\n",
" with open('./output/emotion_model/emotion_encoder.pkl', 'rb') as f:\n",
" loaded_encoder = pickle.load(f)\n",
" \n",
" # 加载特征缩放器\n",
" with open('./output/emotion_model/feature_scaler.pkl', 'rb') as f:\n",
" loaded_scaler = pickle.load(f)\n",
" \n",
" # 加载特征名称\n",
" with open('./output/emotion_model/feature_names.pkl', 'rb') as f:\n",
" loaded_feature_names = pickle.load(f)\n",
" \n",
"        # Pick a test clip from RAVDESS Actor_01, if that directory is available\n",
"        test_file = None\n",
"        ravdess_dir = './RAVDESS/Actor_01'\n",
"        # Guard the listing: os.listdir raises on a missing directory, which would\n",
"        # surface as a generic prediction error instead of the no-test-file message\n",
"        if os.path.isdir(ravdess_dir):\n",
"            # Sort so the chosen file does not depend on directory order\n",
"            ravdess_test_files = sorted(f for f in os.listdir(ravdess_dir) if f.endswith('.wav'))\n",
"            if ravdess_test_files:\n",
"                test_file = os.path.join(ravdess_dir, ravdess_test_files[0])\n",
" \n",
" if test_file:\n",
" print(f\"使用测试文件: {test_file}\")\n",
" \n",
" # 预测情感\n",
" predicted_emotion, emotion_probs = predict_emotion(\n",
" test_file, \n",
" loaded_model, \n",
" loaded_encoder, \n",
" loaded_scaler, \n",
" loaded_feature_names\n",
" )\n",
" \n",
" print(f\"\\n预测情感: {predicted_emotion}\")\n",
" \n",
" # 显示各情感概率\n",
" print(\"\\n各情感概率:\")\n",
" for emotion, prob in sorted(emotion_probs.items(), key=lambda x: x[1], reverse=True):\n",
" print(f\" {emotion}: {prob:.4f}\")\n",
" else:\n",
" print(\"未找到测试音频文件\")\n",
" else:\n",
" print(\"未找到训练好的模型,无法进行预测\")\n",
"except Exception as e:\n",
" print(f\"预测时出错: {e}\")\n",
" import traceback\n",
" traceback.print_exc()\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"在测试集上预测:\n",
"\u001b[1m4/4\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 10ms/step\n",
"准确率: 0.4333\n",
"\n",
"分类报告:\n",
" angry: 精确率=0.7692, 召回率=0.5000, F1分数=0.6061\n",
" fear: 精确率=0.7273, 召回率=0.4000, F1分数=0.5161\n",
" happy: 精确率=0.4000, 召回率=0.4000, F1分数=0.4000\n",
" neutral: 精确率=0.2917, 召回率=0.3500, F1分数=0.3182\n",
" sad: 精确率=0.4444, 召回率=0.4000, F1分数=0.4211\n",
" surprise: 精确率=0.3235, 召回率=0.5500, F1分数=0.4074\n",
"\n",
"平均指标:\n",
"宏平均: 精确率=0.4927, 召回率=0.4333, F1分数=0.4448\n",
"加权平均: 精确率=0.4927, 召回率=0.4333, F1分数=0.4448\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 在测试集上进行预测和评估\n",
"try:\n",
" # 导入必要的模块\n",
" from sklearn.metrics import confusion_matrix, classification_report, accuracy_score\n",
" \n",
" # 加载模型\n",
" print(\"\\n在测试集上预测:\")\n",
" y_pred = emotion_model.model.predict(X_test_reshaped)\n",
" \n",
" # 使用测试集评估模型\n",
" # 定义模型评估函数\n",
"    def evaluate_model(y_true, y_pred, class_names):\n",
"        \"\"\"\n",
"        Evaluate predictions against the ground-truth labels over the full class list.\n",
"\n",
"        NOTE(review): this re-defines the evaluate_model created in the training\n",
"        cell; unlike that one, the result has no 'present_class_names' key.\n",
"\n",
"        Args:\n",
"            y_true: Ground-truth labels; argmax along axis 1 gives the class index.\n",
"            y_pred: Predicted scores; argmax along axis 1 gives the class index.\n",
"            class_names: Class names; index i names class i.\n",
"\n",
"        Returns:\n",
"            results: dict with 'confusion_matrix', 'report' (classification_report\n",
"                as a dict), 'accuracy' and 'present_class_indices' (sorted class\n",
"                indices that occur in y_true).\n",
"        \"\"\"\n",
"        y_true_classes = np.argmax(y_true, axis=1)\n",
"        y_pred_classes = np.argmax(y_pred, axis=1)\n",
"\n",
"        # Pin the label set to every known class. Without it the matrix shrinks when\n",
"        # a class is missing (so it no longer matches the len(class_names) tick\n",
"        # labels drawn from it) and classification_report raises ValueError because\n",
"        # target_names no longer matches the number of observed classes.\n",
"        all_labels = np.arange(len(class_names))\n",
"\n",
"        cm = confusion_matrix(y_true_classes, y_pred_classes, labels=all_labels)\n",
"\n",
"        # zero_division=0 reports 0 for classes that have no samples or no predictions\n",
"        report = classification_report(y_true_classes, y_pred_classes,\n",
"                                       labels=all_labels,\n",
"                                       target_names=class_names,\n",
"                                       output_dict=True,\n",
"                                       zero_division=0)\n",
"\n",
"        accuracy = accuracy_score(y_true_classes, y_pred_classes)\n",
"\n",
"        # Class indices that actually occur in the ground truth\n",
"        present_class_indices = sorted(np.unique(y_true_classes))\n",
"\n",
"        results = {\n",
"            'confusion_matrix': cm,\n",
"            'report': report,\n",
"            'accuracy': accuracy,\n",
"            'present_class_indices': present_class_indices\n",
"        }\n",
"\n",
"        return results\n",
" \n",
" # 获取模型的类别名称\n",
" emotion_classes = encoder.classes_\n",
" \n",
" # 评估模型\n",
" results = evaluate_model(y_test_categorical, y_pred, emotion_classes)\n",
" \n",
" # 打印评估结果\n",
" print(f\"准确率: {results['accuracy']:.4f}\")\n",
" \n",
" print(\"\\n分类报告:\")\n",
" report = results['report']\n",
" \n",
" # 打印每个类别的指标\n",
" for class_name in sorted(report.keys()):\n",
" if class_name not in ['accuracy', 'macro avg', 'weighted avg']:\n",
" metrics = report[class_name]\n",
" print(f\"{class_name:>10}: 精确率={metrics['precision']:.4f}, 召回率={metrics['recall']:.4f}, F1分数={metrics['f1-score']:.4f}\")\n",
" \n",
" # 打印平均指标\n",
" print(\"\\n平均指标:\")\n",
" print(f\"宏平均: 精确率={report['macro avg']['precision']:.4f}, 召回率={report['macro avg']['recall']:.4f}, F1分数={report['macro avg']['f1-score']:.4f}\")\n",
" print(f\"加权平均: 精确率={report['weighted avg']['precision']:.4f}, 召回率={report['weighted avg']['recall']:.4f}, F1分数={report['weighted avg']['f1-score']:.4f}\")\n",
" \n",
" # 绘制混淆矩阵\n",
" plt.figure(figsize=(10, 8))\n",
" cm = results['confusion_matrix']\n",
" \n",
" # 使用matplotlib方式绘制混淆矩阵\n",
" plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n",
" plt.title('混淆矩阵')\n",
" plt.colorbar()\n",
" tick_marks = np.arange(len(emotion_classes))\n",
" plt.xticks(tick_marks, emotion_classes, rotation=45)\n",
" plt.yticks(tick_marks, emotion_classes)\n",
" \n",
" # 在格子中添加数值\n",
" thresh = cm.max() / 2.\n",
" for i in range(cm.shape[0]):\n",
" for j in range(cm.shape[1]):\n",
" plt.text(j, i, cm[i, j],\n",
" horizontalalignment=\"center\",\n",
" color=\"white\" if cm[i, j] > thresh else \"black\")\n",
" \n",
" plt.tight_layout()\n",
" plt.ylabel('真实标签')\n",
" plt.xlabel('预测标签')\n",
" plt.show()\n",
" \n",
"    # Plot the row-normalized confusion matrix\n",
"    plt.figure(figsize=(10, 8))\n",
"    row_totals = cm.sum(axis=1, keepdims=True)\n",
"    # Divide only where the row has samples: a class with no true samples would\n",
"    # otherwise give 0/0 = NaN cells\n",
"    cm_normalized = np.divide(cm, row_totals,\n",
"                              out=np.zeros(cm.shape, dtype=float),\n",
"                              where=row_totals != 0)\n",
"    cm_normalized = np.round(cm_normalized, 2)\n",
"\n",
"    plt.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)\n",
"    plt.title('归一化混淆矩阵')\n",
"    plt.colorbar()\n",
"    tick_marks = np.arange(len(emotion_classes))\n",
"    plt.xticks(tick_marks, emotion_classes, rotation=45)\n",
"    plt.yticks(tick_marks, emotion_classes)\n",
"\n",
"    # Write each cell's value; white text on cells above 0.5, black otherwise\n",
"    for i in range(cm_normalized.shape[0]):\n",
"        for j in range(cm_normalized.shape[1]):\n",
"            plt.text(j, i, f'{cm_normalized[i, j]:.2f}',\n",
"                     horizontalalignment='center',\n",
"                     color='white' if cm_normalized[i, j] > 0.5 else 'black')\n",
"\n",
"    # Set the axis labels before tight_layout so they are part of the layout\n",
"    plt.ylabel('真实标签')\n",
"    plt.xlabel('预测标签')\n",
"    plt.tight_layout()\n",
"    plt.show()\n",
" \n",
"except Exception as e:\n",
" print(f\"评估模型时出错: {e}\")\n",
" import traceback\n",
" traceback.print_exc()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}