賽題介紹
在分布式系統中某個節點發生故障時,故障會沿著分布式系統的拓撲結構進行傳播,造成自身節點及其鄰接節點相關的KPI指標和發生大量日志異常。本次比賽提供分布式數據庫的故障特征數據和標簽數據,其中特征數據是系統發生故障時的KPI指標數據,KPI指標包括由feature0、feature1 …feature106共107個指標,標簽數據為故障類別數據,共6個類別,用0、1、2、3、4、5分別表示6個故障,參賽人員可根據這些數據,借助機器學習、深度學習、web等技術搭建故障診斷系統,該系統支持用戶上傳訓練集對模型進行訓練和模型下載,同時支持用戶上傳單條或多條測試語句進行測試并可視化測試結果,支持測試結果下載。
?
baseline: DecisionTree
數據分析
讀取數據
import pandas as pd

# Load the labelled training set; the default RangeIndex is kept so that
# `sample_id` remains an ordinary column.
df = pd.read_csv('data/train/train.csv', index_col=None)
判斷是否有缺失值
# Per column: True when at least one value is missing
df.isna().any()
'''
output: True即為存在缺失值
sample_id False
feature0 True
feature1 True
feature2 True
feature3 True
...
feature103 True
feature104 True
feature105 False
feature106 True
label False
Length: 109, dtype: bool
'''
數據標準化及缺失值填充
# Standardise the feature columns (z-score) and impute missing values.
# NOTE(review): mean/std are computed over the whole dataset, i.e. before the
# train/test split further down, so held-out rows influence the scaling.
# Consider fitting the statistics on the training split only.
#
# .copy() makes `features` an independent frame: assigning columns on a bare
# slice of `df` triggers pandas' SettingWithCopyWarning and relies on
# view-vs-copy behaviour.
features = df.iloc[:, 1:-1].copy()
numeric_features = features.dtypes[features.dtypes != 'object'].index
features[numeric_features] = features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std())
)
# After standardisation every column has mean 0, so filling NaN with 0 is
# equivalent to mean imputation.
features[numeric_features] = features[numeric_features].fillna(0)
# Features + label (used for group statistics and the correlation matrix)
features_labels = pd.concat([features, df[['label']]], axis=1)
# Rebuild `df` as sample_id + processed features + label
train_features = pd.concat([df[['sample_id']], features], axis=1)
train_label = df[['sample_id', 'label']]
df = pd.concat([train_features, train_label[['label']]], axis=1)
觀察數據基本信息
# Preview the first five rows
df.head()
?文章來源:http://www.zghlxwxcb.cn/news/detail-514446.html
# Dataset size as (rows, columns)
df.shape
'''
output:
(6296, 109)
'''
# Data type of every column
df.dtypes
'''
output:
sample_id int64
feature0 float64
feature1 float64
feature2 float64
feature3 float64
...
feature103 float64
feature104 float64
feature105 float64
feature106 float64
label int64
Length: 109, dtype: object
'''
# Class distribution: number of samples per label, ordered by label id,
# shown first as a bar chart and then as a pie chart.
label_counts = df['label'].value_counts().sort_index()
label_counts.plot(kind='bar')
plt.show()
label_counts.plot(kind='pie')
plt.show()
?
?
# Summary statistics for every feature column
features.describe()
?
# Mean of every feature within each fault class
label_Summary = features_labels.groupby(by='label')
label_Summary.mean()
?
# Pairwise correlation matrix over all features plus the label
corr = features_labels.corr()
# Request a very large (100 x 100) figure so that all tick labels stay legible
sns.set_context({'figure.figsize': [100, 100]})
tick_labels = corr.columns.values
# sns.heatmap returns the Axes it drew on; the owning Figure is fetched to save it
fig = sns.heatmap(corr, xticklabels=tick_labels, yticklabels=tick_labels)
heatmap = fig.get_figure()
heatmap.savefig('work/heatmap.png', dpi=300)
corr
?
# Kernel density estimate of every feature, one curve per fault class.
feature_names = features.columns.values.tolist()
# Fixed colour per class id so curves are comparable across all plots
class_colors = {0: 'b', 1: 'r', 2: 'g', 3: 'y', 4: 'm', 5: 'c'}
for name in feature_names:
    fig = plt.figure(figsize=(15, 4))
    for class_id, color in class_colors.items():
        # NOTE(review): `shade` is deprecated in favour of `fill` in recent
        # seaborn releases; kept so the snippet still runs on older versions.
        ax = sns.kdeplot(df.loc[df['label'] == class_id, name],
                         color=color, shade=True, label=str(class_id))
    ax.set(xlabel=name, ylabel='頻率')
    # The curves are labelled above; draw the legend explicitly so the labels show
    ax.legend()
    plt.title('{} Probabilitydensity function'.format(name))
    plt.savefig('work/{}的概率密度函數圖.png'.format(name))
    # Close each figure after saving; otherwise one figure per feature stays
    # open and memory grows with the number of features.
    plt.close(fig)
文章來源地址http://www.zghlxwxcb.cn/news/detail-514446.html
劃分數據集
from sklearn.model_selection import train_test_split

# Separate predictors from the target, then hold out 15% of the rows for
# validation. The split is stratified on the label and uses a fixed seed.
target_name = 'label'
x = df.drop(columns=['sample_id', 'label'])
y = df[['label']]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=123,
    stratify=y,
)
模型訓練
from sklearn.tree import DecisionTreeClassifier

# Instantiate the baseline classifier. The class is imported directly above,
# so it is called by its own name (there is no `tree` module in scope).
dtree = DecisionTreeClassifier(
    criterion='entropy',
    min_weight_fraction_leaf=0.01,
)
# Fit on the training split
dtree = dtree.fit(x_train, y_train)
評價指標計算
def metrics_calculate(pred, y_test, txt_path, num_classes=6):
    """Print and log per-class, macro and micro precision / recall / F1.

    Per-class precision, recall and F1 are printed, followed by the
    micro-averaged scores. The per-class, macro-averaged and micro-averaged
    scores are appended to ``txt_path``.

    Args:
        pred: 1-D sequence (list, array, ...) of predicted class ids.
        y_test: 1-D sequence of true class ids, same length as ``pred``.
        txt_path: Path of the text report; opened in append mode as UTF-8.
        num_classes: Number of classes; class ids are ``0 .. num_classes - 1``.
            Labels outside that range are ignored for the class they name.

    Raises:
        ValueError: If ``pred`` and ``y_test`` differ in length.
    """
    from collections import Counter

    if len(pred) != len(y_test):
        raise ValueError(
            'pred and y_test must have the same length, got {} and {}'.format(
                len(pred), len(y_test)))

    def _ratio(numerator, denominator):
        # A class with no predictions / no samples scores 0 instead of
        # raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    # Confusion counts keyed by class id.
    tp, fp, fn = Counter(), Counter(), Counter()
    for predicted, actual in zip(pred, y_test):
        if predicted == actual:
            tp[actual] += 1
        else:
            fp[predicted] += 1
            fn[actual] += 1

    classes = range(num_classes)
    precision = [_ratio(tp[c], tp[c] + fp[c]) for c in classes]
    recall = [_ratio(tp[c], tp[c] + fn[c]) for c in classes]
    f1 = [_ratio(2 * p * r, p + r) for p, r in zip(precision, recall)]

    for value in precision:
        print('Precision: {}\n'.format(value))
    for value in recall:
        print('Recall: {}\n'.format(value))
    for value in f1:
        print('F1: {}\n'.format(value))

    # Macro average: unweighted mean of the per-class scores.
    macro_precision = _ratio(sum(precision), num_classes)
    macro_recall = _ratio(sum(recall), num_classes)
    macro_f1 = _ratio(sum(f1), num_classes)

    # Micro average: pool the counts of all classes before dividing.
    tp_total = sum(tp[c] for c in classes)
    predicted_total = tp_total + sum(fp[c] for c in classes)
    actual_total = tp_total + sum(fn[c] for c in classes)
    micro_precision = _ratio(tp_total, predicted_total)
    print('Micro_Precision: {}\n'.format(micro_precision))
    micro_recall = _ratio(tp_total, actual_total)
    print('Micro_Recall: {}\n'.format(micro_recall))
    micro_f1 = _ratio(2 * micro_precision * micro_recall,
                      micro_precision + micro_recall)
    print('Micro_F1: {}\n'.format(micro_f1))

    report = []
    for c in classes:
        report.append('類別{}: '.format(c))
        report.append('Precision: {:.2f}%'.format(precision[c] * 100))
        report.append('Recall: {:.2f}%'.format(recall[c] * 100))
        report.append('F1: {:.2f}'.format(f1[c]))
    report.append('Macro_Precision: {:.2f}%'.format(macro_precision * 100))
    report.append('Macro_Recall: {:.2f}%'.format(macro_recall * 100))
    report.append('Macro_F1: {:.2f}'.format(macro_f1))
    report.append('Micro_Precision: {:.2f}%'.format(micro_precision * 100))
    report.append('Micro_Recall: {:.2f}%'.format(micro_recall * 100))
    report.append('Micro_F1: {:.2f}'.format(micro_f1))

    # The context manager closes the file even if a write fails.
    with open(txt_path, 'a', encoding='utf-8') as f:
        f.write('\n'.join(report) + '\n')
驗證模型
import numpy as np

# Evaluate the fitted tree on the held-out split
pred = dtree.predict(x_test)
# y_test comes out of train_test_split as a single-column DataFrame, which has
# no .reshape(); convert it to a flat ndarray so metrics_calculate can index
# it by position.
y_test = np.asarray(y_test).reshape((-1, ))
# NOTE(review): the file name says RandomForest although the model is a
# decision tree; the path is kept unchanged for compatibility.
txt_path = 'work/result_RandomForest.txt'
metrics_calculate(pred, y_test, txt_path)
'''
Precision: 0.8382066276803118
Precision: 0.6823529411764706
Precision: 0.7553956834532374
Precision: 0.7368421052631579
Precision: 0.972972972972973
Precision: 0.8157894736842105
Recall: 0.8829568788501027
Recall: 0.5858585858585859
Recall: 0.6907894736842105
Recall: 0.8433734939759037
Recall: 0.6792452830188679
Recall: 0.8732394366197183
F1: 0.86
F1: 0.6304347826086957
F1: 0.7216494845360826
F1: 0.7865168539325843
F1: 0.7999999999999999
F1: 0.8435374149659864
Micro_Precision: 0.8052910052910053
Micro_Recall: 0.8052910052910053
Micro_F1: 0.8052910052910053
'''
到了這里,關于第十二屆“中國軟件杯”大賽:A10-基于機器學習的分布式系統故障診斷系統——baseline(一)的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!