chuban
This commit is contained in:
243
utils/evaluation.py
Normal file
243
utils/evaluation.py
Normal file
@@ -0,0 +1,243 @@
|
||||
"""
|
||||
评估工具模块,用于评估模型性能
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
|
||||
from sklearn.metrics import classification_report, confusion_matrix
|
||||
import pandas as pd
|
||||
|
||||
def _evaluation_class_indices(values):
    """Convert model inputs to class indices.

    Args:
        values: Either a 2-D array with one row per sample (one-hot labels
            or per-class scores) or a 1-D array of integer class indices.

    Returns:
        A ``(indices, width)`` tuple. ``indices`` holds one class index per
        sample; ``width`` is the number of columns of a 2-D input, or 0 when
        the input was already 1-D.
    """
    arr = np.asarray(values)
    if arr.ndim > 1:
        return np.argmax(arr, axis=1), arr.shape[1]
    return arr, 0


def evaluate_model(y_true, y_pred, class_names=None):
    """Compute overall and per-class classification metrics.

    Args:
        y_true: Ground-truth labels, either one-hot rows (2-D) or integer
            class indices (1-D).
        y_pred: Predictions, either per-class scores (2-D) or integer class
            indices (1-D).
        class_names: Optional sequence of class names; entry ``i`` names
            class index ``i``.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1'`` (precision/recall/F1 are support-weighted averages),
        ``'confusion_matrix'``, and ``'class_metrics'``, which maps a class
        key to its own ``'precision'``, ``'recall'`` and ``'f1'``. The class
        key is the entry of ``class_names`` when names are given, otherwise
        ``'class_<i>'`` where ``i`` is the position in the per-class arrays
        returned by scikit-learn.
    """
    true_idx, true_width = _evaluation_class_indices(y_true)
    pred_idx, pred_width = _evaluation_class_indices(y_pred)

    has_names = class_names is not None and len(class_names) > 0

    # Without an explicit label list scikit-learn only reports the labels that
    # occur in the data, so positional pairing with class_names would shift
    # whenever a class is absent. Pin the label set when names are supplied.
    label_ids = None
    if has_names:
        observed = [
            int(idx.max()) + 1 for idx in (true_idx, pred_idx) if idx.size
        ]
        n_labels = max(len(class_names), true_width, pred_width, *observed)
        label_ids = np.arange(n_labels)

    cm = confusion_matrix(true_idx, pred_idx, labels=label_ids)
    accuracy = accuracy_score(true_idx, pred_idx)

    # Support-weighted averages over all classes.
    precision = precision_score(
        true_idx, pred_idx, labels=label_ids, average='weighted', zero_division=0
    )
    recall = recall_score(
        true_idx, pred_idx, labels=label_ids, average='weighted', zero_division=0
    )
    f1 = f1_score(
        true_idx, pred_idx, labels=label_ids, average='weighted', zero_division=0
    )

    # One score per class (average=None returns an array).
    class_precision = precision_score(
        true_idx, pred_idx, labels=label_ids, average=None, zero_division=0
    )
    class_recall = recall_score(
        true_idx, pred_idx, labels=label_ids, average=None, zero_division=0
    )
    class_f1 = f1_score(
        true_idx, pred_idx, labels=label_ids, average=None, zero_division=0
    )

    if has_names:
        keys = class_names
    else:
        keys = [f'class_{i}' for i in range(len(class_precision))]

    # zip stops at the shorter sequence, so surplus names or surplus score
    # rows are dropped rather than raising.
    class_metrics = {
        key: {'precision': p, 'recall': r, 'f1': f}
        for key, p, r, f in zip(keys, class_precision, class_recall, class_f1)
    }

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'class_metrics': class_metrics
    }
|
||||
|
||||
def print_evaluation_results(results, class_names=None):
    """Print an evaluation report to standard output.

    Args:
        results: Dict with the keys ``'accuracy'``, ``'precision'``,
            ``'recall'``, ``'f1'`` and ``'confusion_matrix'``; an optional
            ``'class_metrics'`` entry maps a class name to a dict with
            ``'precision'``, ``'recall'`` and ``'f1'``.
        class_names: Accepted for interface compatibility; not read here.
    """
    rule = "=" * 50
    # (label, key) pairs shared by the overall and the per-class sections.
    overall_rows = (
        ("准确率 (Accuracy)", 'accuracy'),
        ("精确率 (Precision)", 'precision'),
        ("召回率 (Recall)", 'recall'),
        ("F1分数 (F1)", 'f1'),
    )
    per_class_rows = overall_rows[1:]

    print(rule)
    print("模型评估结果:")
    print(rule)

    for label, key in overall_rows:
        print(f"{label}: {results[key]:.4f}")

    print("\n混淆矩阵:")
    print(results['confusion_matrix'])

    if 'class_metrics' in results:
        print("\n每个类别的指标:")
        for class_name, metrics in results['class_metrics'].items():
            print(f"{class_name}:")
            for label, key in per_class_rows:
                print(f" {label}: {metrics[key]:.4f}")

    print(rule)
|
||||
|
||||
def _language_eval_indices(values):
    """Normalise labels or scores to integer class indices.

    Args:
        values: A 2-D array with more than one column (one-hot labels or
            per-class scores) or an array of numeric class indices.

    Returns:
        A ``(indices, width)`` tuple: 1-D integer class indices, and the
        column count of a multi-column input (0 when the input already held
        class indices).
    """
    arr = np.asarray(values)
    if arr.ndim > 1 and arr.shape[1] > 1:
        return np.argmax(arr, axis=1), arr.shape[1]
    return arr.reshape(-1).astype(int), 0


def evaluate_by_language(y_true, y_pred, language, class_names=None):
    """Evaluate predictions separately for each language.

    Args:
        y_true: Ground-truth labels, numeric class indices or one-hot rows.
        y_pred: Predictions, numeric class indices or per-class scores.
        language: Per-sample language tags (array or plain sequence).
        class_names: Optional class names forwarded to ``evaluate_model``.

    Returns:
        Dict mapping each distinct language tag to the result of
        ``evaluate_model`` for that language's samples.
    """
    true_idx, true_width = _language_eval_indices(y_true)
    pred_idx, pred_width = _language_eval_indices(y_pred)

    # A plain list compared with ``==`` yields a single bool, not a mask, so
    # coerce to an array before filtering.
    language = np.asarray(language)

    # Use one class count for every language so each call sees rows of the
    # same width, even when a language lacks some classes.
    observed = [int(idx.max()) + 1 for idx in (true_idx, pred_idx) if idx.size]
    named = len(class_names) if class_names is not None else 0
    n_classes = max(true_width, pred_width, named, *observed)
    one_hot_rows = np.eye(n_classes)

    results = {}
    for lang in np.unique(language):
        mask = language == lang
        # evaluate_model accepts one-hot / score rows (it takes argmax along
        # axis 1), so re-encode the selected indices as one-hot rows.
        results[lang] = evaluate_model(
            one_hot_rows[true_idx[mask]],
            one_hot_rows[pred_idx[mask]],
            class_names,
        )

    return results
|
||||
|
||||
def print_evaluation_by_language(results, languages=None):
    """Print one evaluation report per language to standard output.

    Args:
        results: Dict mapping a language code to that language's evaluation
            result dict.
        languages: Optional dict mapping language codes to display names.
            When omitted, ``'zh'`` and ``'en'`` are mapped to their Chinese
            display names; codes without an entry are printed as-is.
    """
    if languages is None:
        display_names = {'zh': '中文', 'en': '英文'}
    else:
        display_names = languages

    banner = "=" * 50
    for code, per_language in results.items():
        title = display_names.get(code, code)
        print(banner)
        print(f"{title}数据评估结果:")
        print(banner)
        print_evaluation_results(per_language)
|
||||
|
||||
def get_top_n_predictions(probabilities, class_names, n=3):
    """Return the ``n`` highest-scoring classes and their scores.

    Args:
        probabilities: 1-D sequence of per-class scores; each selected index
            is used to subscript both this sequence and ``class_names``.
        class_names: Sequence of class names indexed like ``probabilities``.
        n: Number of entries to return. Values larger than the number of
            classes return every class; zero or negative values return
            empty lists.

    Returns:
        Dict with ``'classes'`` (names ordered from highest to lowest score)
        and ``'probabilities'`` (the matching scores).
    """
    # Reverse first, then truncate: slicing with ``[-n:]`` would return the
    # whole array when n == 0.
    descending = np.argsort(probabilities)[::-1]
    top_n_indices = descending[:max(n, 0)]

    return {
        'classes': [class_names[i] for i in top_n_indices],
        'probabilities': [probabilities[i] for i in top_n_indices]
    }
|
||||
|
||||
def get_emotion_accuracy_by_speaker(y_true, y_pred, speaker_ids, emotions):
    """Build a speaker-by-emotion table of prediction accuracy.

    Samples are grouped by the pair ``(speaker_ids[k], emotions[k])``; each
    cell holds the fraction of samples in that group whose predicted label
    equals the true label.

    Args:
        y_true: Ground-truth labels, class indices or one-hot rows.
        y_pred: Predictions, class indices or per-class scores.
        speaker_ids: Per-sample speaker identifiers (array or sequence).
        emotions: Per-sample emotion tags used for grouping (array or
            sequence).

    Returns:
        ``pandas.DataFrame`` indexed by the sorted unique speaker ids, with
        one column per sorted unique emotion tag. Cells for combinations
        with no samples are NaN.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Collapse one-hot labels / per-class scores to class indices.
    if y_true.ndim > 1 and y_true.shape[1] > 1:
        y_true = np.argmax(y_true, axis=1)
    if y_pred.ndim > 1 and y_pred.shape[1] > 1:
        y_pred = np.argmax(y_pred, axis=1)

    # Compare once up front; flattening keeps a single-column input from
    # broadcasting against a 1-D one.
    is_correct = y_true.reshape(-1) == y_pred.reshape(-1)

    # Plain lists compared with ``==`` give a single bool instead of a mask,
    # which would silently leave every cell NaN.
    speaker_ids = np.asarray(speaker_ids)
    emotions = np.asarray(emotions)

    unique_speakers = np.unique(speaker_ids)
    unique_emotions = np.unique(emotions)

    # Start from NaN so combinations without samples need no special case.
    accuracy_matrix = np.full(
        (len(unique_speakers), len(unique_emotions)), np.nan
    )

    for i, speaker in enumerate(unique_speakers):
        speaker_mask = speaker_ids == speaker
        for j, emotion in enumerate(unique_emotions):
            mask = speaker_mask & (emotions == emotion)
            if mask.any():
                accuracy_matrix[i, j] = is_correct[mask].mean()

    return pd.DataFrame(
        accuracy_matrix,
        index=unique_speakers,
        columns=unique_emotions
    )
|
||||
Reference in New Issue
Block a user