"""Feature extraction module: derive acoustic features from audio signals.

The module turns a raw waveform into a flat dictionary of scalar descriptors
(time-domain, spectral, cepstral and pitch statistics), converts lists of such
dictionaries into a numeric matrix, standardizes the matrix and reshapes it
for an LSTM-style model.
"""

import os
import pickle

import librosa
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Summary statistics that _add_stats can compute, keyed by the suffix that is
# appended to the feature name.
_STAT_FUNCS = {
    'mean': np.mean,
    'std': np.std,
    'max': np.max,
    'min': np.min,
}


def _add_stats(features, prefix, values, stats=('mean', 'std', 'max', 'min')):
    """Store ``{prefix}_{stat}`` entries in *features* for each requested stat.

    Entries are inserted in the order given by *stats*; the insertion order of
    the feature dictionary defines the column order of the feature matrix, so
    it must stay stable.
    """
    for stat in stats:
        features[f'{prefix}_{stat}'] = _STAT_FUNCS[stat](values)


def extract_features(audio, sr=22050):
    """Extract features from one audio signal (kept for the legacy name).

    Args:
        audio: Audio signal (1-D sequence of samples).
        sr: Sample rate in Hz.

    Returns:
        dict: Feature name -> scalar value, see ``extract_features_single``.
    """
    return extract_features_single(audio, sr)


def extract_features_single(audio, sr=22050):
    """Extract a flat dictionary of acoustic features from one audio signal.

    The signal is first forced to exactly five seconds (zero-padded at the end
    or truncated) so that every clip yields statistics over the same number of
    samples.

    Args:
        audio: Audio signal (1-D sequence of samples).
        sr: Sample rate in Hz.

    Returns:
        dict: Feature name -> scalar value. Keys are always inserted in the
        same order.
    """
    # Force a uniform length of 5 seconds. int() keeps slicing/padding valid
    # even if the sample rate is passed as a float.
    max_samples = int(sr * 5)
    if len(audio) < max_samples:
        # Too short: pad the tail with zeros.
        audio = np.pad(audio, (0, max_samples - len(audio)), 'constant')
    else:
        # Too long: truncate.
        audio = audio[:max_samples]

    features = {}

    # Zero-crossing rate.
    zcr = librosa.feature.zero_crossing_rate(audio)[0]
    _add_stats(features, 'zero_crossing_rate', zcr)

    # Root-mean-square energy.
    rms = librosa.feature.rms(y=audio)[0]
    _add_stats(features, 'rms', rms)

    # Mel-frequency cepstral coefficients, numbered from 1.
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    for i, coeff in enumerate(mfccs, start=1):
        _add_stats(features, f'mfcc_{i}', coeff)

    # Spectral centroid.
    spectral_centroid = librosa.feature.spectral_centroid(y=audio, sr=sr)[0]
    _add_stats(features, 'spectral_centroid', spectral_centroid)

    # Spectral bandwidth.
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0]
    _add_stats(features, 'spectral_bandwidth', spectral_bandwidth)

    # Spectral roll-off.
    spectral_rolloff = librosa.feature.spectral_rolloff(y=audio, sr=sr)[0]
    _add_stats(features, 'spectral_rolloff', spectral_rolloff)

    # Chroma: only the mean of the first five chroma rows is used. This is
    # kept as-is so the feature set stays unchanged.
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    for i in range(5):
        features[f'chroma_{i + 1}_mean'] = np.mean(chroma[i])

    # Mel spectrogram.
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    _add_stats(features, 'mel_spec', mel_spec, stats=('mean', 'std'))

    # Log-scaled mel spectrogram.
    # NOTE(review): melspectrogram returns a power spectrogram by default, for
    # which librosa.power_to_db would be the matching conversion. The call is
    # left unchanged because switching it would shift the values of these two
    # features for any already-fitted scaler/model - confirm before changing.
    log_power = librosa.amplitude_to_db(mel_spec)
    _add_stats(features, 'log_power', log_power, stats=('mean', 'std'))

    # Statistical moments of the waveform itself.
    audio_mean = np.mean(audio)
    audio_std = np.std(audio)
    features['audio_mean'] = audio_mean
    features['audio_std'] = audio_std
    if audio_std > 0:
        centered = audio - audio_mean
        features['audio_skew'] = np.mean(centered ** 3) / audio_std ** 3
        features['audio_kurtosis'] = np.mean(centered ** 4) / audio_std ** 4 - 3
    else:
        # A constant (e.g. silent or empty, hence all-padding) signal has no
        # defined skew/kurtosis; use 0 instead of dividing by zero, which
        # would put NaN into the feature matrix.
        features['audio_skew'] = 0.0
        features['audio_kurtosis'] = 0.0

    # Pitch estimate: for every frame take the pitch of the bin with the
    # largest magnitude, then drop frames whose pitch is not positive
    # (treated as silent frames).
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
    strongest_bins = np.argmax(magnitudes, axis=0)
    frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
    voiced = frame_pitches[frame_pitches > 0]
    if voiced.size:
        _add_stats(features, 'pitch', voiced)
    else:
        # No usable pitch in any frame.
        for stat in ('mean', 'std', 'max', 'min'):
            features[f'pitch_{stat}'] = 0

    # Tuning offset.
    features['tuning_offset'] = librosa.estimate_tuning(y=audio, sr=sr)

    # Spectral flatness.
    spectral_flatness = librosa.feature.spectral_flatness(y=audio)[0]
    _add_stats(features, 'spectral_flatness', spectral_flatness)

    # Spectral contrast: per-band statistics followed by overall statistics.
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    for i, band in enumerate(spectral_contrast, start=1):
        _add_stats(features, f'spectral_contrast_{i}', band,
                   stats=('mean', 'std'))
    _add_stats(features, 'spectral_contrast', spectral_contrast,
               stats=('mean', 'std'))

    # First- and second-order MFCC deltas. All first-order entries are
    # written before any second-order entry to keep the key order stable.
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
    for i, coeff in enumerate(mfcc_delta, start=1):
        _add_stats(features, f'mfcc_{i}_delta', coeff, stats=('mean', 'std'))
    for i, coeff in enumerate(mfcc_delta2, start=1):
        _add_stats(features, f'mfcc_{i}_delta2', coeff, stats=('mean', 'std'))

    return features


def extract_features_batch(audio_list, sr=22050):
    """Extract features for every audio signal in a list.

    A failure on one item is reported on stdout and recorded as an empty
    dictionary, so the returned list always lines up index-for-index with
    *audio_list*.

    Args:
        audio_list: Iterable of audio signals.
        sr: Sample rate in Hz, forwarded to ``extract_features_single``.

    Returns:
        list[dict]: One feature dictionary per input (``{}`` on failure).
    """
    features_list = []
    for i, audio in enumerate(tqdm(audio_list, desc="提取特征")):
        try:
            features = extract_features_single(audio, sr)
        except Exception as e:
            # Deliberate best-effort: one bad clip must not abort the batch.
            print(f"处理第{i}个音频时出错: {e}")
            features = {}
        features_list.append(features)
    return features_list


def features_to_matrix(features_list, feature_names=None):
    """Convert a list of feature dictionaries into a 2-D feature matrix.

    Args:
        features_list: List of feature dictionaries; empty dictionaries are
            allowed and produce all-zero rows.
        feature_names: Column names/order. If None, the keys of the first
            non-empty dictionary are used.

    Returns:
        tuple: ``(X, feature_names)`` where ``X`` has shape
        ``(len(features_list), len(feature_names))``. Features missing from a
        dictionary are left at 0.

    Raises:
        ValueError: If *feature_names* is None and every dictionary is empty.
    """
    if feature_names is None:
        feature_names = next(
            (list(features.keys()) for features in features_list if features),
            None,
        )
        if feature_names is None:
            raise ValueError("所有特征字典都是空的,无法确定特征名称")

    X = np.zeros((len(features_list), len(feature_names)))
    for i, features in enumerate(features_list):
        if not features:
            # Failed/empty extraction: keep the row at zero.
            continue
        for j, name in enumerate(feature_names):
            if name in features:
                X[i, j] = features[name]

    return X, feature_names


def normalize_features(X_train, X_val, X_test, output_dir='output/emotion_model'):
    """Standardize the three splits using statistics of the training split.

    The scaler is fitted on *X_train* only and then applied to all splits.
    As a side effect the fitted scaler is pickled to
    ``<output_dir>/feature_scaler.pkl`` (the directory is created if needed).

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix.
        X_test: Test feature matrix.
        output_dir: Directory in which the fitted scaler is saved.

    Returns:
        tuple: ``(X_train_norm, X_val_norm, X_test_norm)``.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train_norm = scaler.transform(X_train)
    X_val_norm = scaler.transform(X_val)
    X_test_norm = scaler.transform(X_test)

    # Persist the scaler so the same transformation can be re-applied later.
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, 'feature_scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)

    return X_train_norm, X_val_norm, X_test_norm


def normalize_features_with_params(X, scaler):
    """Standardize *X* with an already-fitted scaler.

    Args:
        X: Feature matrix.
        scaler: Fitted scaler object exposing ``transform``.

    Returns:
        The transformed feature matrix.
    """
    return scaler.transform(X)


def reshape_for_lstm(X):
    """Reshape a feature matrix into a 3-D array with a single time step.

    Args:
        X: Feature matrix, or a list that ``np.array`` can convert to one.

    Returns:
        For 2-D input of shape ``(n, d)``, an array of shape ``(n, 1, d)``.
        Input with any other number of dimensions is returned as-is (after
        list-to-array conversion).
    """
    if isinstance(X, list):
        X = np.array(X)

    # Insert the time-step axis only for plain 2-D matrices.
    if len(X.shape) == 2:
        return X.reshape(X.shape[0], 1, X.shape[1])

    return X