This article collects data preprocessing problems and methods the author has run into in various modeling competitions. It focuses on data in Excel or CSV format and prepares it for subsequent machine learning or deep learning.
Data Cleaning
Import the libraries and the data file. An absolute path is used here; it can be changed to a relative path.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the CSV data file (GBK encoding is common for files exported from Chinese-locale Excel)
data = pd.read_csv(r'D:\1112222.csv', encoding='gbk')
The file read here is in CSV format. If you have an xlsx file, it is recommended to open it in Excel first and save it as a CSV file, choosing the option shown in the figure below:
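Alternatively, pandas can read an xlsx file directly without converting it first; a minimal sketch (this assumes the openpyxl package is installed, and the file name here is only a placeholder):
# Read the first sheet of an xlsx workbook directly (requires openpyxl)
data = pd.read_excel(r'D:\1112222.xlsx', sheet_name=0)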
Merging Multiple Files
Several CSV files may exist at the same time and need to be merged.
import glob, os
path_in = r'D:\in'  # folder containing the input CSV files
file_paths = glob.glob(os.path.join(path_in, '*.csv'))
print(file_paths)
df1 = pd.DataFrame()
for file in file_paths:
    df2 = pd.read_csv(file, sep=',', header=None)
    # df2 = df2.iloc[:, 2]  # uncomment to keep only the third column
    df1 = pd.concat([df1, df2], axis=0)  # axis=0 stacks vertically; axis=1 would concatenate horizontally
print('Shape of the merged DataFrame:', df1.shape)
# print(df1)
# Write the merged data to a new file
df1.to_csv(r'D:\inner\result.csv', index=False, sep=',')
Filling Missing Values
For numeric columns, fill missing values with the column mean; for string columns, fill them with the mode.
def fill_missing_values(df):
    """Fill missing values in each column with the column mean (numeric) or mode (non-numeric)."""
    for column in df:
        if pd.api.types.is_numeric_dtype(df[column]):  # numeric column
            mean = df[column].mean()
            df[column] = df[column].fillna(mean)  # fill missing values with the mean
        else:  # non-numeric column
            mode = df[column].mode().iloc[0]  # most frequent value
            df[column] = df[column].fillna(mode)  # fill missing values with the mode
    return df
data = fill_missing_values(data)
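As an optional sanity check, you can confirm that no missing values remain after the fill:
# Count remaining missing values per column; every entry should now be 0
print(data.isnull().sum())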
Removing Symbols from the Data
Some data may contain spaces or Chinese/English punctuation that needs to be removed.
import re
import string
from zhon.hanzi import punctuation
# English punctuation: escape each character and remove it as a substring (regex=True makes replace work inside strings)
for ch in string.punctuation:
    data = data.replace(re.escape(ch), '', regex=True)
# Chinese punctuation
for ch in punctuation:
    data = data.replace(re.escape(ch), '', regex=True)
Of course, when I run into this kind of data I actually recommend using Excel's built-in features to remove it by hand, which is often faster than fiddling with code (sometimes half a day of debugging could have been done manually long before).
As shown in the figure, column C contains values with leading spaces. Insert a new column after column C and, in its first row, type the value from column C with the spaces removed.
Then press CTRL+E (Flash Fill), and column D is automatically filled with the space-free values (make sure there are no other empty cells at this point, otherwise those gaps will be auto-filled as well).
Forty thousand rows were filled in under a second, which is fairly fast. Then simply delete column C.
If the data contains special characters you want to remove, the same method works; when only a few columns need the operation, it is faster than writing code.
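If you prefer to stay in pandas, a minimal sketch for stripping leading and trailing spaces from one string column (the column name 'C' is only a placeholder):
# Strip leading/trailing whitespace from a single string column
data['C'] = data['C'].astype(str).str.strip()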
Removing Redundant Data
Some columns contain the same value in every row. Such variables are useless for machine learning, but with a large dataset it is impractical to inspect every column by hand, so they are removed in code.
for col in data.columns:
    # if every value in this column is identical
    if data[col].nunique() == 1:
        # drop the column
        data = data.drop(col, axis=1)
Type Conversion
Convert True/False data to int:
data['11'] = data['11'].astype(int)
Other types work the same way: replace "int" with the target type, for example as shown below.
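For example (a sketch reusing the same placeholder column), converting to float, or converting strings to numbers while turning unparseable entries into NaN:
# Convert a column to float
data['11'] = data['11'].astype(float)
# Convert string values such as '3.5' to numbers; invalid entries become NaN
data['11'] = pd.to_numeric(data['11'], errors='coerce')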
Converting Letters or Strings to Numbers
If you want to convert letters or strings to numbers so they can be used as variables later, I recommend doing the replacement directly in Excel with CTRL+H (Find & Replace); code for this always seems to run into one error or another. A pandas alternative is sketched below.
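If you do want to do it in code, a minimal pandas sketch (not the author's method; 'category_col' is a placeholder column name) maps each distinct string to an integer code:
# factorize assigns a stable integer code to every distinct value
codes, uniques = pd.factorize(data['category_col'])
data['category_col'] = codes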
Merging Several Columns
If the data provides year, month, and day in separate columns, they need to be merged into one column.
data['timestamp'] = data['月'].astype(str) + '-' + data['日'].astype(str) + '-' + data['具體時(shí)間']
data = data.drop(['月', '日', '具體時(shí)間'], axis=1)
# Optionally set the timestamp column as the index
#data.set_index('timestamp', inplace=True)
data['time'] = pd.to_datetime('2023-' + data['timestamp'], format='%Y-%m-%d-%H:%M:%S')
Taking the mean of several columns and merging them into one:
class_df = (data['ROLL_ATT1']+data['ROLL_ATT2'])/2
data['ROLL_ATT1'] = class_df
data = data.drop(['ROLL_ATT2','MAGNETIC_HEADING'], axis=1)
Data Visualization
The processed data can be visualized.
Using one independent variable (such as time) as the x-axis, plot line charts of all the other variables against it:
import matplotlib.pyplot as plt
# Plot every variable against time
feature = data.columns[1:]
for feas in feature:
    plt.plot(data['time'], data[feas])
    plt.xlabel('Time')
    plt.ylabel(feas)
    plt.show()
To examine the distribution of a variable, draw a histogram:
# Plot the distribution histogram
plt.hist(df2['train1'], bins=20)
plt.xlabel('train1')
plt.ylabel('Frequency')
plt.title('Takeoff Weight Distribution')
plt.show()
Drawing scatter (pair) plots:
import seaborn as sns
sns.pairplot(data, hue='label')
plt.savefig(r"D:\pairplot001.png")
The result looks like this:
You can also draw a heatmap of the correlation coefficients:
import seaborn as sns
sns.set(style="ticks")
sns.heatmap(data.corr(), annot=True, cmap="YlGnBu");
plt.savefig(r"D:\heatmap.png")
The result looks like this:
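Note that with newer pandas versions (roughly 1.5 and later), data.corr() may complain if non-numeric columns remain; in that case restrict the correlation to numeric columns:
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="YlGnBu")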
Preprocessing for Machine Learning and Deep Learning
Import the libraries (the keras imports are only needed for deep learning; if you are only doing machine learning they can be skipped).
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils, plot_model
from sklearn import metrics
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
from keras.models import model_from_json
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
import itertools
First, split the dataset:
data2 = data.drop(['label'], axis=1)
X = data2.astype(float).values  # keep X two-dimensional here; the channel dimension is added later, just before training
Y = data['label']
print(X.shape)
print(Y.shape)
# split off the training set first, then split the remainder evenly into validation and test sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.7, random_state=20)
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=20)
Data standardization:
standard = StandardScaler()
# Fit the scaler on the training set: it computes and stores the training mean and standard deviation
x_train = standard.fit_transform(x_train)
# Apply the training-set mean and standard deviation to the test and validation sets
x_test = standard.transform(x_test)
x_valid = standard.transform(x_valid)
The remaining preprocessing steps are only needed for deep learning; for machine learning they can be skipped.
Deep learning requires one-hot encoding of the labels. Either of the two approaches below works; apply one of them, not both.
from keras.utils import to_categorical
y_test = to_categorical(y_test)
y_train = to_categorical(y_train)
y_valid = to_categorical(y_valid)  # encode the validation labels as well
from keras import backend as K
K.set_image_dim_ordering("tf")  # in newer Keras versions the equivalent call is K.set_image_data_format("channels_last")
# Alternative: one-hot encoding with sklearn's OneHotEncoder
from sklearn import preprocessing
def one_hot(Train_Y, Test_Y):
    Train_Y = np.array(Train_Y).reshape([-1, 1])
    Test_Y = np.array(Test_Y).reshape([-1, 1])
    Encoder = preprocessing.OneHotEncoder()
    Encoder.fit(Train_Y)
    Train_Y = Encoder.transform(Train_Y).toarray()
    Test_Y = Encoder.transform(Test_Y).toarray()
    Train_Y = np.asarray(Train_Y, dtype=np.int32)
    Test_Y = np.asarray(Test_Y, dtype=np.int32)
    return Train_Y, Test_Y
y_train, y_test = one_hot(y_train, y_test)
Next, the data needs to be adjusted to match the training parameters. Suppose the training parameters are set as follows:
# Training parameters
batch_size = 128
epochs = 40          # number of training epochs
num_classes = 6      # total number of classes
length = 2048
BatchNorm = False    # whether to use batch normalization
number = 1000        # number of samples per class
normal = False       # whether to standardize
Reshape the training data, otherwise passing it to the model will raise an error:
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1))
x_valid = x_valid.reshape((x_valid.shape[0], x_valid.shape[1], 1))  # reshape the validation set as well
The number of samples must also be trimmed to a multiple of batch_size, otherwise training will fail:
# Trim the dataset so its size is a multiple of batch_size
def change_dataset_size(x, y, batch_size):
    length = len(x)
    if length % batch_size != 0:
        remainder = length % batch_size
        x = x[:(length - remainder)]
        y = y[:(length - remainder)]
    return x, y
x_train, y_train = change_dataset_size(x_train, y_train, batch_size)
x_valid, y_valid = change_dataset_size(x_valid, y_valid, batch_size)
After this step, the data can basically be fed straight into a model for training.
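For completeness, here is a minimal sketch of feeding the preprocessed data into a simple Conv1D classifier. This is not a specific recommended architecture; the filter counts and layer sizes are placeholder choices, and it assumes the shapes and one-hot encoded labels produced above:
# A small 1D-CNN: input shape is (number of features, 1) after the reshape above
model = Sequential()
model.add(Conv1D(filters=16, kernel_size=3, activation='relu',
                 input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_valid, y_valid))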
This concludes this overview of data preprocessing methods for mathematical modeling.