Data mining for disk damage prediction

Keywords: Python, Machine Learning, Data Mining, Mathematical Modeling

Data import and preprocessing

First, we import all the packages required for this data mining task

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn import svm
import seaborn as sns

Next, we import the disk data and perform some basic processing on it

file_path='data/smart dataset sample.csv'
df=pd.read_csv(file_path)
data=df.iloc[:,3:]
X=data.iloc[:,1:]
Y=data.iloc[:,0]

# Delete samples with empty features
X.dropna(how='all',inplace=True)
Y=Y.iloc[X.index]

X=X.reset_index(drop=True)
Y=Y.reset_index(drop=True)

The data table df imported at the beginning is shown in the following figure

There are 14 fields and 11036 samples. We first drop the first three fields, which are not useful, to obtain a new data table data, and then split the SMART features and the failure label into X and Y

For X, we first drop the disk samples whose feature values are all empty (one sample in total), then update X and Y in sync, and finally reset the indices of X and Y. The resulting X and Y are shown in the figures below

X dataset

Y dataset

Missing value filling

There are several methods for filling missing values (mean filling, mode filling, regression filling, autoencoder filling)

What surprised me is that the mean-filled data set performed best in the cross_val_score evaluation; I analyze the reason later
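
For reference, the mean and mode filling mentioned above can be done directly with pandas; a minimal sketch, assuming X is the feature table obtained earlier:

# Mean filling: replace each missing entry with its column mean
X_mean = X.fillna(X.mean())

# Mode filling: replace each missing entry with its column mode
X_mode = X.fillna(X.mode().iloc[0])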

KNN filling

# KNN regression / classification to fill missing values
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor

def knn_missing_filled(train_x,train_y,test,k=3,dispersed=True):
    if dispersed:
        clf=KNeighborsClassifier(n_neighbors=k,weights='distance',n_jobs=-1)
    else:
        clf=KNeighborsRegressor(n_neighbors=k,weights='distance',n_jobs=-1)
    clf.fit(train_x,train_y)
    return test.index,clf.predict(test)

RandomForest

# RandomForest to fill missing values

from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier  # Random forest regression

def randomforest_missing_filled(train_x,train_y,test,dispersed=True):
    if dispersed:
        clf=RandomForestClassifier(n_estimators=10)
    else:
        clf=RandomForestRegressor(n_estimators=10)
    clf.fit(train_x,train_y)
    return test.index,clf.predict(test)

Autoencoder neural network

The model structure of the autoencoder is shown in the figure above. It encodes and compresses the input to extract important features, then decodes to reconstruct the data. In essence, it learns a function $f_{W,b}(X) = X$.

We can use the complete data as the training set $X$ and feed it into the autoencoder; through repeated iteration the network learns the distribution and key characteristics of the data. We then take the data block containing missing values, $X'$, fill the missing entries with random numbers, and feed $X'$ into the network for several iterations; using the distribution and other important features extracted from the complete data, the network gradually transforms $X'$ back to a restored version, which fills in the missing data

import random
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from keras.objectives import mse
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.regularizers import l1_l2
from collections import defaultdict
%matplotlib inline
def make_reconstruction_loss(n_features):
    '''Reconstruction error'''
    def reconstruction_loss(input_and_mask, y_pred):
        X_values = input_and_mask[:, :n_features]
        X_values.name = "$X_values"
        missing_mask = input_and_mask[:, n_features:]
        missing_mask.name = "$missing_mask"
        observed_mask = 1 - missing_mask
        observed_mask.name = "$observed_mask"
        X_values_observed = X_values * observed_mask
        X_values_observed.name = "$X_values_observed"
        pred_observed = y_pred * observed_mask
        pred_observed.name = "$y_pred_observed"
        return mse(y_true=X_values_observed, y_pred=pred_observed)
    return reconstruction_loss
def masked_mae(X_true, X_pred, mask):
    masked_diff = X_true[mask] - X_pred[mask]
    return np.mean(np.abs(masked_diff))
class Autoencoder:
    def __init__(self, data,
                 recurrent_weight=0.5,
                 optimizer="adam",
                 dropout_probability=0.5,
                 hidden_activation="relu",
                 output_activation="sigmoid",
                 init="glorot_normal",
                 l1_penalty=0,
                 l2_penalty=0):
        self.data = data.copy()
        self.recurrent_weight = recurrent_weight
        self.optimizer = optimizer
        self.dropout_probability = dropout_probability
        self.hidden_activation = hidden_activation
        self.output_activation = output_activation
        self.init = init
        self.l1_penalty = l1_penalty
        self.l2_penalty = l2_penalty
    def _get_hidden_layer_sizes(self):
        n_dims = self.data.shape[1]
        return [
            min(2000, 8 * n_dims),
            min(500, 2 * n_dims),
            int(np.ceil(0.5 * n_dims)),
        ]
    def _create_model(self):
        # NOTE: W_regularizer / init below are legacy Keras 1 argument names
        # (kernel_regularizer / kernel_initializer in Keras 2 and later)
        hidden_layer_sizes = self._get_hidden_layer_sizes()
        first_layer_size = hidden_layer_sizes[0]
        n_dims = self.data.shape[1]
        model = Sequential()
        model.add(Dense(
            first_layer_size,
            input_dim= 2 * n_dims,
            activation=self.hidden_activation,
            W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            init=self.init))
        model.add(Dropout(self.dropout_probability))
        for layer_size in hidden_layer_sizes[1:]:
            model.add(Dense(
                layer_size,
                activation=self.hidden_activation,
                W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
                init=self.init))
            model.add(Dropout(self.dropout_probability))
        model.add(Dense(
            n_dims,
            activation=self.output_activation,
            W_regularizer=l1_l2(self.l1_penalty, self.l2_penalty),
            init=self.init))
        loss_function = make_reconstruction_loss(n_dims)
        model.compile(optimizer=self.optimizer, loss=loss_function)
        return model
    def fill(self, missing_mask):
        self.data[missing_mask] = -1
    def _create_missing_mask(self):
        if self.data.dtype != "f" and self.data.dtype != "d":
            self.data = self.data.astype(float)
        return np.isnan(self.data)
    def _train_epoch(self, model, missing_mask, batch_size):
        input_with_mask = np.hstack([self.data, missing_mask])
        n_samples = len(input_with_mask)
        n_batches = int(np.ceil(n_samples / batch_size))
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        X_shuffled = input_with_mask[indices]
        for batch_idx in range(n_batches):
            batch_start = batch_idx * batch_size
            batch_end = (batch_idx + 1) * batch_size
            batch_data = X_shuffled[batch_start:batch_end, :]
            model.train_on_batch(batch_data, batch_data)
        return model.predict(input_with_mask)
    def train(self, batch_size=256, train_epochs=100):
        missing_mask = self._create_missing_mask()
        self.fill(missing_mask)
        self.model = self._create_model()
        observed_mask = ~missing_mask
        for epoch in range(train_epochs):
            X_pred = self._train_epoch(self.model, missing_mask, batch_size)
            observed_mae = masked_mae(X_true=self.data,
                                    X_pred=X_pred,
                                    mask=observed_mask)
            if epoch % 50 == 0:
                print("observed mae:", observed_mae)
            old_weight = (1.0 - self.recurrent_weight)
            self.data[missing_mask] *= old_weight
            pred_missing = X_pred[missing_mask]
            self.data[missing_mask] += self.recurrent_weight * pred_missing
        return self.data.copy()
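
A hypothetical usage sketch of the Autoencoder class above (it expects a float NumPy array in which missing entries are NaN; rescaling the features to [0, 1] beforehand is an assumption made here to match the sigmoid output layer):

# Rescale features to [0, 1] (assumed); NaN entries stay NaN and mark the values to impute
X_scaled = (X - X.min()) / (X.max() - X.min())

# Train the autoencoder and recover the imputed table
imputer = Autoencoder(X_scaled.values.astype(float))
X_ae = pd.DataFrame(imputer.train(train_epochs=300), columns=X.columns, index=X.index)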

The filling process uses the complete features
'smart_5_raw','smart_187_raw','smart_188_raw','smart_197_raw','smart_198_raw'

together with the corresponding failure label as the training input, trains a learner to fit a hypothesis function $f(x)$, and uses it to estimate the missing values

The specific process is as follows:

  • Taking the feature smart_198_raw as an example, the data table is split into two sets according to this column: rows where smart_198_raw is not empty form the training set, and rows where it is empty form the test set to be predicted

  • Any learning model from the sklearn library is then used to predict these values and fill them in, giving the completed data set

  • The other features with missing values are handled with the same procedure (a sketch of this loop is shown below)
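
A minimal sketch of this per-feature filling loop, built on the knn_missing_filled helper defined earlier (it uses the complete features listed above as predictors; the exact predictor set used in the original experiment is an assumption here):

# Hypothetical per-feature imputation loop using KNN regression
complete_cols = ['smart_5_raw','smart_187_raw','smart_188_raw','smart_197_raw','smart_198_raw']
X_knn = X.copy()
for col in X_knn.columns:
    if X_knn[col].isnull().sum() == 0:
        continue                                # nothing to fill in this column
    known = X_knn[X_knn[col].notnull()]         # rows where the target feature is present
    unknown = X_knn[X_knn[col].isnull()]        # rows whose value must be predicted
    idx, pred = knn_missing_filled(known[complete_cols], known[col],
                                   unknown[complete_cols], k=3, dispersed=False)
    X_knn.loc[idx, col] = pred                  # write the predictions back into the table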

X filled with knn

X filled with mean

Feature distribution visualization

After the imputation produces the complete data sets X and Y, we build an intuitive picture of the data by looking at the distribution of each feature

Visualization code

%matplotlib inline
def hist_attribute(df,bin=150):
    '''See how the features are distributed'''
    plt.figure(figsize=[20,25])
    plt.title('feature distribution')
    for i in range(len(df.columns)):
        title=df.columns[i]
        plt.subplot(5,2,i+1)
        plt.hist(df.iloc[:,i].values,bins=bin)
        plt.title(title)
        plt.xlabel('value')
        plt.ylabel('count')
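
Presumably the function is then called on the imputed feature table, for example:

hist_attribute(X)   # one histogram per SMART feature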

Distribution of features after mean/mode filling

Distribution of features after KNN / RandomForest / autoencoder filling

It is apparent that the distributions of the first five features in the figure above are extremely imbalanced, which suggests that these features are closely related to whether the hard disk is damaged

Explanation: some SMART values record the number of abnormal events on the hard disk (restarts, high-intensity reads/writes, etc.). If these counts are small or even 0, the disk is working normally; if they are large, the disk is likely damaged
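
As a quick check of this claim (a hypothetical snippet, not from the original post), one can look at the fraction of samples with a non-zero value in each of these count-type features:

# Fraction of samples with a non-zero value in each count-type SMART feature
count_cols = ['smart_5_raw','smart_187_raw','smart_188_raw','smart_197_raw','smart_198_raw']
print((X[count_cols] != 0).mean())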

For the latter five features:

  • smart_193_raw follows an exponential distribution and has a high correlation with the sample labels
  • The remaining four features, smart_241_raw, smart_242_raw, smart_9_raw and smart_194_raw, all follow a single Gaussian distribution or a mixture of two Gaussians; they are natural characteristics of the disk samples and have low correlation with the label (naturally occurring random quantities generally follow a Gaussian distribution)




Viewing correlations between features

%matplotlib inline
complete_features=X
# Draw correlation heatmap
plt.figure(figsize=(10,10))
sns.heatmap(complete_features.corr(),annot=True)


It can be seen that the correlations between the features are not particularly strong, so they should not have a major effect on the classification task

Normalization

# normalization
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

# X=all_log(X)

ss=StandardScaler()
ms=MaxAbsScaler()
X_norm=ss.fit_transform(X)
# X_norm=ms.fit_transform(X)
y_norm=np.array(Y.values).reshape(-1,1)

PCA dimensionality reduction and sample distribution visualization

def plot_pca(num,data,label):
    pca=PCA(n_components=num)
    X_pca=pca.fit_transform(data)
    # Split data
    X_failure=np.array([x for i,x in enumerate(X_pca) if label[i][0]==1])
    X_healthy=np.array([x for i,x in enumerate(X_pca) if label[i][0]==0])
    
    if num==3:
        fig = plt.figure()
        ax = Axes3D(fig)   
        #ax.legend(loc='best')
        ax.set_zlabel('Z', fontdict={'size': 15, 'color': 'red'})
        ax.set_ylabel('Y', fontdict={'size': 15, 'color': 'red'})
        ax.set_xlabel('X', fontdict={'size': 15, 'color': 'red'})
        ax.scatter(X_failure[:,0], X_failure[:,1], X_failure[:,2])
        ax.scatter(X_healthy[:,0], X_healthy[:,1], X_healthy[:,2])
    elif num==2:
        plt.figure(figsize=[10,10])
        plt.scatter(X_failure[:,0],X_failure[:,1])
        plt.scatter(X_healthy[:,0],X_healthy[:,1])
    else:
        print('i do not want to work.....')
# Dimensionality reduction visualization
%matplotlib auto
plot_pca(num=3,data=X_norm,label=y_norm)

  • Sample distribution visualization: data filled with mean/mode, normalized with StandardScaler, and reduced to 3D by PCA

It can be seen that there is roughly a hyperplane in the three-dimensional feature space that separates the two classes

  • Sample distribution visualization: data filled with mean/mode, normalized with MaxAbsScaler, and reduced to 3D by PCA

It can be seen that the samples are essentially not linearly separable (they collapse into a single cluster); presumably this normalization method destroys the structure of the data

  • Sample distribution visualization: data filled with KNN / RandomForest / autoencoder, normalized with StandardScaler, and reduced to 3D by PCA

Compared with the mean/mode-filled data, the dimension that clearly separates the failed samples from the healthy samples is destroyed, which is the direct reason for adopting mean filling

Accuracy of different learners on the data set

In this experiment, five kinds of learners are trained on the data set

Conclusion:

Accuracy: fully connected neural network (8-neuron hidden layer) > (RandomForest > DecisionTree > ExtraTrees) > SVM classifier

In the following table, the rows indicate the method used to fill the missing values and the columns indicate the normalization method
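
Such a grid could be assembled with a loop like the following (a hedged sketch; the variable names X_mean and X_knn come from the earlier filling sketches and are assumptions, not the original experiment code):

# Hypothetical grid: rows = filling method, columns = normalization method
from sklearn.preprocessing import StandardScaler, MaxAbsScaler

filled = {'mean': X_mean, 'knn': X_knn}
scalers = {'StandardScaler': StandardScaler(), 'MaxAbsScaler': MaxAbsScaler()}
results = pd.DataFrame(index=list(filled), columns=list(scalers))

for fill_name, X_f in filled.items():
    for scale_name, scaler in scalers.items():
        X_s = scaler.fit_transform(X_f)
        clf = RandomForestClassifier(n_estimators=10, random_state=0)
        results.loc[fill_name, scale_name] = cross_val_score(clf, X_s, Y, cv=5).mean()
print(results)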



Test code

  • Tree model and SVM model
import warnings
warnings.filterwarnings("ignore")
clf_1 = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf_2 = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0)
clf_3 = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
# clf_4=svm.SVC()
# clf_4 = svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, 
#                 probability=False, tol=0.001, cache_size=200, class_weight=None, 
#                 verbose=False, max_iter=-1, decision_function_shape='ovr', 
#                 random_state=None)

scores1 = cross_val_score(clf_1, X_norm, y_norm, cv=5)
scores2 = cross_val_score(clf_2, X_norm, y_norm, cv=5)
scores3 = cross_val_score(clf_3, X_norm, y_norm, cv=5)
# scores4 = cross_val_score(clf_4, X_norm, y_norm, cv=5)

print('RandomForestClassifier scores: ',scores1.mean())
print('DecisionTreeClassifier scores: ',scores2.mean())
print('ExtraTreesClassifier scores: ',scores3.mean())
# print('SVC scores: ',scores4.mean())
  • Neural network model (8-neuron hidden layer)
# Try neural network classification
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import KFold
seed = 7
np.random.seed(seed)
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=10, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
# splitting data into training set and test set. If random_state is set to an integer, the split datasets are fixed.
X_train, X_test, Y_train, Y_test = train_test_split(X_norm, y_norm, test_size=0.3, random_state=0)
estimator.fit(X_train, Y_train)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
scores5 = cross_val_score(estimator, X_norm, y_norm, cv=kfold)
print('neuralNetwork scores: ',scores5.mean())

Summary

This is the author's first relatively complete data mining experiment. If there are deficiencies, please don't hesitate to give advice!
