Files
yuyinfenxi/data_utils/feature_extractor.py
2025-07-02 13:54:05 +08:00

302 lines
9.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
特征提取模块,用于从音频信号中提取声学特征
"""
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import pickle
import os
def extract_features(audio, sr=22050):
    """
    Extract features from one audio signal (kept for the legacy function name).

    This is a compatibility alias: it forwards both arguments unchanged to
    ``extract_features_single`` and returns whatever that function returns.

    Args:
        audio: audio signal
        sr: sampling rate
    Returns:
        features: feature dictionary
    """
    features = extract_features_single(audio, sr)
    return features
def _add_stats(features, prefix, values, stats=("mean", "std", "max", "min")):
    """Store summary statistics of ``values`` in ``features``.

    One entry is written per name in ``stats`` under the key
    ``f"{prefix}_{stat}"``, in the order the names are given.
    """
    reducers = {"mean": np.mean, "std": np.std, "max": np.max, "min": np.min}
    for stat in stats:
        features[f"{prefix}_{stat}"] = reducers[stat](values)


def extract_features_single(audio, sr=22050, max_duration=5):
    """
    Extract acoustic features from a single audio signal.

    The signal is zero-padded or truncated to ``max_duration`` seconds, then
    summarised with statistics of several librosa descriptors (zero-crossing
    rate, RMS, MFCCs and their deltas, spectral centroid/bandwidth/rolloff/
    flatness/contrast, chroma, mel spectrogram, pitch, tuning) plus the raw
    sample moments.

    Args:
        audio: audio signal (1-D sequence of samples)
        sr: sampling rate
        max_duration: clip length in seconds that every signal is padded or
            truncated to (default 5, the previously hard-coded value)
    Returns:
        features: dict mapping feature name to a scalar value. The insertion
            order of the keys is kept stable because ``features_to_matrix``
            derives its column order from it.
    """
    audio = np.asarray(audio)

    # Bring every clip to the same length. int() keeps the slice below valid
    # even when sr or max_duration is passed as a float.
    max_samples = int(sr * max_duration)
    if len(audio) < max_samples:
        # Too short: pad the tail with zeros
        audio = np.pad(audio, (0, max_samples - len(audio)), 'constant')
    else:
        # Too long: truncate
        audio = audio[:max_samples]

    features = {}

    # Zero-crossing rate
    _add_stats(features, 'zero_crossing_rate',
               librosa.feature.zero_crossing_rate(audio)[0])

    # RMS energy
    _add_stats(features, 'rms', librosa.feature.rms(y=audio)[0])

    # MFCCs (13 coefficients, keys numbered from 1)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    for i, coeff in enumerate(mfccs, start=1):
        _add_stats(features, f'mfcc_{i}', coeff)

    # Spectral centroid
    _add_stats(features, 'spectral_centroid',
               librosa.feature.spectral_centroid(y=audio, sr=sr)[0])

    # Spectral bandwidth
    _add_stats(features, 'spectral_bandwidth',
               librosa.feature.spectral_bandwidth(y=audio, sr=sr)[0])

    # Spectral rolloff
    _add_stats(features, 'spectral_rolloff',
               librosa.feature.spectral_rolloff(y=audio, sr=sr)[0])

    # Chroma: only the first five rows are used, as in the original feature set
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    for i in range(5):
        features[f'chroma_{i + 1}_mean'] = np.mean(chroma[i])

    # Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    _add_stats(features, 'mel_spec', mel_spec, stats=('mean', 'std'))

    # Log-scaled mel spectrogram.
    # NOTE(review): melspectrogram is called with its default `power`, which
    # presumably yields a power spectrogram; `librosa.power_to_db` would then
    # be the matching conversion. Left unchanged on purpose because switching
    # would shift these feature values for any already-trained model/scaler.
    log_power = librosa.amplitude_to_db(mel_spec)
    _add_stats(features, 'log_power', log_power, stats=('mean', 'std'))

    # Moments of the raw waveform
    audio_mean = np.mean(audio)
    audio_std = np.std(audio)
    features['audio_mean'] = audio_mean
    features['audio_std'] = audio_std
    if audio_std > 0:
        centered = audio - audio_mean
        features['audio_skew'] = np.mean(centered ** 3) / (audio_std ** 3)
        features['audio_kurtosis'] = np.mean(centered ** 4) / (audio_std ** 4) - 3
    else:
        # Silent/constant clip: the ratios would be 0/0 (NaN); report 0
        # so no NaN leaks into the feature matrix.
        features['audio_skew'] = 0.0
        features['audio_kurtosis'] = 0.0

    # Pitch estimate: per frame, take the pitch of the strongest bin and keep
    # only frames where that pitch is positive (drops silent frames).
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
    strongest_bin = np.argmax(magnitudes, axis=0)
    frame_pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
    voiced = frame_pitch[frame_pitch > 0]
    if voiced.size:
        _add_stats(features, 'pitch', voiced)
    else:
        features['pitch_mean'] = 0
        features['pitch_std'] = 0
        features['pitch_max'] = 0
        features['pitch_min'] = 0

    # Tuning offset
    features['tuning_offset'] = librosa.estimate_tuning(y=audio, sr=sr)

    # Spectral flatness
    _add_stats(features, 'spectral_flatness',
               librosa.feature.spectral_flatness(y=audio)[0])

    # Spectral contrast: per-band statistics first, then overall statistics
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    for i, band in enumerate(spectral_contrast, start=1):
        _add_stats(features, f'spectral_contrast_{i}', band, stats=('mean', 'std'))
    _add_stats(features, 'spectral_contrast', spectral_contrast, stats=('mean', 'std'))

    # MFCC first- and second-order deltas (all first-order keys are written
    # before any second-order key to preserve the original column order)
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)
    for i, row in enumerate(mfcc_delta, start=1):
        _add_stats(features, f'mfcc_{i}_delta', row, stats=('mean', 'std'))
    for i, row in enumerate(mfcc_delta2, start=1):
        _add_stats(features, f'mfcc_{i}_delta2', row, stats=('mean', 'std'))

    return features
def extract_features_batch(audio_list, sr=22050):
    """
    Extract features for a list of audio signals.

    Each signal is processed independently with ``extract_features_single``.
    A signal that raises is reported on stdout and represented by an empty
    dict, so the returned list always lines up index-for-index with
    ``audio_list``.

    Args:
        audio_list: list of audio signals
        sr: sampling rate forwarded to ``extract_features_single``
            (defaults to 22050, the rate that was previously always used)
    Returns:
        features_list: list of feature dictionaries
    """
    features_list = []
    for i, audio in enumerate(tqdm(audio_list, desc="提取特征")):
        try:
            features = extract_features_single(audio, sr)
        except Exception as e:
            # Best effort: one bad clip must not abort the whole batch.
            print(f"处理第{i}个音频时出错: {e}")
            # Empty placeholder keeps indices aligned with the input list
            features = {}
        features_list.append(features)
    return features_list
def features_to_matrix(features_list, feature_names=None):
    """
    Convert a list of feature dictionaries into a 2-D feature matrix.

    Args:
        features_list: list of feature dictionaries
        feature_names: list of feature names; when None, the keys of the
            first non-empty dictionary are used
    Returns:
        X: feature matrix with one row per dictionary and one column per name
        feature_names: the feature names that define the columns
    Raises:
        ValueError: if no names were given and every dictionary is empty
    """
    if feature_names is None:
        # Column layout comes from the first dictionary that has any entries
        feature_names = next(
            (list(entry.keys()) for entry in features_list if entry),
            None,
        )

    if feature_names is None:
        raise ValueError("所有特征字典都是空的,无法确定特征名称")

    n_rows = len(features_list)
    n_cols = len(feature_names)
    X = np.zeros((n_rows, n_cols))

    for row, entry in enumerate(features_list):
        if entry:
            for col, name in enumerate(feature_names):
                # Names missing from this dictionary keep the zero fill
                if name in entry:
                    X[row, col] = entry[name]
        # Empty dictionaries leave their row as all zeros

    return X, feature_names
def normalize_features(X_train, X_val, X_test, output_dir='output/emotion_model'):
    """
    Standardize the features of the train/validation/test splits.

    A ``StandardScaler`` is fitted on the training set only and then applied
    to all three splits. The fitted scaler is pickled to
    ``<output_dir>/feature_scaler.pkl``.

    Args:
        X_train: training-set feature matrix
        X_val: validation-set feature matrix
        X_test: test-set feature matrix
        output_dir: directory the fitted scaler is saved to (created if
            missing). Defaults to the previously hard-coded
            'output/emotion_model'. Pass None to skip saving.
    Returns:
        X_train_norm: standardized training set
        X_val_norm: standardized validation set
        X_test_norm: standardized test set
    """
    # Fit on the training set only; the other splits reuse its statistics
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train_norm = scaler.transform(X_train)
    X_val_norm = scaler.transform(X_val)
    X_test_norm = scaler.transform(X_test)

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        # NOTE: this is a pickle file; only load it back from trusted locations.
        with open(os.path.join(output_dir, 'feature_scaler.pkl'), 'wb') as f:
            pickle.dump(scaler, f)

    return X_train_norm, X_val_norm, X_test_norm
def normalize_features_with_params(X, scaler):
    """
    Standardize features with an already-fitted scaler.

    Args:
        X: feature matrix
        scaler: fitted scaler; its ``transform`` method is applied to ``X``
    Returns:
        X_norm: standardized feature matrix
    """
    X_norm = scaler.transform(X)
    return X_norm
def reshape_for_lstm(X):
    """
    Reshape a feature matrix into the 3-D layout used as LSTM input.

    Args:
        X: feature matrix, or any array-like convertible by ``np.asarray``
            (list, tuple, DataFrame, ...)
    Returns:
        X_reshaped: array reshaped as follows
            - 1-D ``(n_features,)``            -> ``(1, 1, n_features)``
            - 2-D ``(n_samples, n_features)``  -> ``(n_samples, 1, n_features)``
            - anything else (e.g. already 3-D) is returned unchanged
    """
    # Accept any array-like, not just lists, so tuples and DataFrames do not
    # fail on the .shape / .reshape calls below.
    X = np.asarray(X)

    if X.ndim == 1:
        # A single feature vector: one sample with one time step
        return X.reshape(1, 1, X.shape[0])
    if X.ndim == 2:
        # Insert a time-step axis of length 1
        return X.reshape(X.shape[0], 1, X.shape[1])
    # Already 3-D (or higher): return as is
    return X