MFCC(Mel Cepstrum Coefficient) C++ Code Implementation (mel Feature Extraction Ported to librosa)

Keywords: OpenCV Python github

I. Introduction

The abbreviation of Mel Frequency Cepstrum Coefficient is MFCC, which is widely used in automatic speech and speaker recognition.

Mel frequency is based on the auditory characteristics of the human ear, and it has a non-linear relationship with the frequency of Hz. Mel Frequency Cepstrum Coefficient (MFCC) is a kind of frequency spectrum characteristic calculated by utilizing the relationship between them.

After recording an analog speech signal with a recording device, the analog speech signal is converted to digital speech signal by sampling at a self-determined sampling frequency (such as 8000 Hz, 16000 Hz, etc.). Because the waveform change of speech signal in time domain is very fast and difficult to observe, it is usually observed in frequency domain. Its spectrum changes slowly with time. Therefore, it can be assumed that the characteristics of speech signal are stable in a relatively short period of time. Usually we define it. This relatively short time is a frame. According to the change of the pitch period value of human voice, it usually takes 10-20 Ms.

For concepts, please refer to: https://www.e-learn.cn/content/qita/798278
Be familiar with the operation process before you start. https://blog.csdn.net/zouxy09/article/details/9156785

2. python implementation

For a 2 second 22050 sampling rate file:

  • Overall time-consuming: about 200 ms
  • Except file loading time: 9ms

The program is not provided for the time being. The prompt is based on librosa.

C++ Implementation

1. Modify the program based on the blog: https://blog.csdn.net/LiuPeiP_VIPL/article/details/81742392
2. Transplantation according to the operation logic of librosa library on Python platform
3. Using NumCpp to implement NumPy of Python platform: https://github.com/dpilger26/NumCpp
4. The FFT operation in this example is very slow (about 160ms), which was later found on GitHub (about 6ms): https://github.com/HiFi-LoFi/AudioFFT
5. After verification, it was found that NumCpp was inefficient, so opencv was used to implement matrix operation.

For a 2 seconds 22050 sampling rate file (without considering file loading):

  • Overall time-consuming: 8ms
  • FFT time-consuming: 6ms
//
// Created by toson on 19-7-17.
//
// 1. Modification based on the blog program: https://blog.csdn.net/LiuPeiP_VIPL/article/details/81742392
// 2. Transplantation according to the operation logic of librosa library on Python platform
// 3. Using NumCpp to implement NumPy: https://github.com/dpilger26/NumCpp for Python platform
// 4. The FFT operation in this example is very slow (about 160ms), and later it was found on GitHub (about 6ms): https://github.com/HiFi-LoFi/AudioFFT.
// 5. After verification, it was found that NumCpp was inefficient, so opencv was used to implement matrix operation.

#include "utils/mfcc.h"
#include "utils.h"
#include "AudioFFT.h"
//#include <torch/torch.h>
//#include <cuda_runtime.h>
//#include <cublas_v2.h>

int nSamplesPerSec = 16000;                     //Sample rate // Sample rate.(keda, thchs30, aishell)
int length_DFT = 2048;                          //Fourier points //fft points (samples)
int hop_length = int(0.025 * nSamplesPerSec);   //Step // Right offset of next frame data relative to this frame
int win_length = int(0.1 * nSamplesPerSec);     //Frame Length//Assuming 16000 Sampling Rate, 0.1s Time Data is taken
int number_filterbanks = 80;                    //Number of filters //Number of Mel banks to generate
float preemphasis = 0.97;//0.95                 //or None
int max_db = 100;
int ref_db = 20;
int r = 1;                                      //r=1 in python
double pi = 3.14159265358979323846;
//int number_coefficients = 1025; // discrete transformation dimension, and finally 3*number_coefficients dimension feature data
//int number_feature_vectors; //How many frames does the. wav have?

nc::NdArray<double> mel_basis;
cv::Mat_<float> hannWindow;

//Implementation of FFT
void Fast_Fourier_Transform(int direction, int length, std::vector<double> &Xr, std::vector<double> &Xi) {
    int log_length = (int) (log((double) length) / log(2.0));

    double pi = 3.14159265358979323846;

    for (int i = 0, j = 0; i < length; i++, j = 0) {
        for (int k = 0; k < log_length; k++) {
            j = (j << 1) | (1 & (i >> k));
        }
        if (j < i) {
            double t;

            t = Xr[i];
            Xr[i] = Xr[j];
            Xr[j] = t;

            t = Xi[i];
            Xi[i] = Xi[j];
            Xi[j] = t;
        }
    }
    for (int i = 0; i < log_length; i++) {
        int L = (int) pow(2.0, i);

        for (int j = 0; j < length - 1; j += 2 * L) {
            for (int k = 0; k < L; k++) {
                double argument = direction * -pi * k / L;

                double xr = Xr[j + k + L] * cos(argument) - Xi[j + k + L] * sin(argument);
                double xi = Xr[j + k + L] * sin(argument) + Xi[j + k + L] * cos(argument);

                Xr[j + k + L] = Xr[j + k] - xr;
                Xi[j + k + L] = Xi[j + k] - xi;
                Xr[j + k] = Xr[j + k] + xr;
                Xi[j + k] = Xi[j + k] + xi;
            }
        }
    }
    if (direction == -1) {
        for (int k = 0; k < length; k++) {
            Xr[k] /= length;
            Xi[k] /= length;
        }
    }
}

void FFT(int direction, int length, std::vector<double> &Xr, std::vector<double> &Xi) {
    int log_length = int(log((double) length) / log(2.0));

    if (direction != 1 && direction != -1) {
        //fprintf(stderr, "[FFT], [direction = {-1 (inversed transform), 1 (forward transform)}\n");
        return;
    }
    if (1 << log_length != length) {
        //fprintf(stderr, "[FFT], [length must be a power of 2]\n");
        return;
    }
    Fast_Fourier_Transform(direction, length, Xr, Xi);
}

double hz_to_mel(float frequencies, bool htk = false) {
    if (htk) {
        return 2595.0 * nc::log(1.0 + frequencies / 700.0);
    }
    // Fill in the linear part
    float f_min = 0.0;
    auto f_sp = float(200.0 / 3);
    double mels = (frequencies - f_min) / f_sp;
    // Fill in the log-scale part
    float min_log_hz = 1000.0;                         // beginning of log region (Hz)
    float min_log_mel = (min_log_hz - f_min) / f_sp;   // same (Mels)
    double logstep = nc::log(6.4) / 27.0;              // step size for log region

    // Porting against the librosa Library of Python platform
//    if (frequencies.ndim) {
//        // If we have array data, vectorize
//        log_t = (frequencies >= min_log_hz)
//        mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep
//    } else
    if (frequencies >= min_log_hz) {
        // If we have scalar data, heck directly
        mels = min_log_mel + nc::log(frequencies / min_log_hz) / logstep;
    }
    return mels;
}

nc::NdArray<double> mel_to_hz(nc::NdArray<double> mels, bool htk = false) {
    // Fill in the linear scale
    float f_min = 0.0;
    auto f_sp = float(200.0 / 3);
    nc::NdArray<double> freqs = mels * f_sp + f_min;
    // And now the nonlinear scale
    float min_log_hz = 1000.0;                         // beginning of log region (Hz)
    float min_log_mel = (min_log_hz - f_min) / f_sp;   // same (Mels)
    double logstep = nc::log(6.4) / 27.0;              // step size for log region
    //If (mels. ndim) {// NC does not have. ndim
    // If we have vector data, vectorize
    auto log_t = (mels >= min_log_mel);
    for (int i = 0; i < log_t.shape().cols; i++) {
        if (log_t[i]) {
            freqs[i] = nc::exp((mels[i] - min_log_mel) * logstep) * min_log_hz;
        }
    }
    //}
    return freqs;
}

nc::NdArray<double> mel_spectrogram_create(int nps, int n_fft, int n_mels) {
    auto time1 = get_milli_time();
    auto fmax = float(nps / 2.0);
    float fmin = 0;
    int n_fft_2 = 1 + n_fft / 2;
    // Initialize the weights
    auto weights = nc::zeros<double>(nc::uint32(n_mels), nc::uint32(n_fft_2));
    // Center freqs of each FFT bin
    auto fftfreqs = nc::linspace<double>(0, nps / 2.0, nc::uint32(n_fft_2), true); //Equivariant queue
    auto time2 = get_milli_time();
    std::cout << "\ttime1-2 cost: " << time2 - time1 << std::endl;

    // 'Center freqs' of mel bands - uniformly spaced between limits
    double min_mel = hz_to_mel(fmin, false);
    double max_mel = hz_to_mel(fmax, false);
    auto mels = nc::linspace(min_mel, max_mel, nc::uint32(n_mels + 2));
    auto mel_f = mel_to_hz(mels, false);
    auto time3 = get_milli_time();
    std::cout << "\ttime2-3 cost: " << time3 - time2 << std::endl;

    auto fdiff = nc::diff(mel_f); //Calculate the discrete difference of dimension N along the specified axis (the latter element subtracts the former element)
    auto time4 = get_milli_time();
    std::cout << "\tnc::diff cost: " << time4 - time3 << std::endl;

    //auto ramps = nc::subtract.outer(mel_f, fftfreqs); //nc does not have subtract.outer
    nc::NdArray<double> ramps = nc::zeros<double>(mel_f.shape().cols, fftfreqs.shape().cols);
    for (int i = 0; i < mel_f.shape().cols; i++) {
        for (int j = 0; j < fftfreqs.shape().cols; j++) {
            ramps(i, j) = mel_f[i] - fftfreqs[j];
        }
    }
    auto time5 = get_milli_time();
    std::cout << "\ttime4-5 cost: " << time5 - time4 << std::endl;

    for (int i = 0; i < n_mels; i++) {
        // lower and upper slopes for all bins
        auto ramps_1 = nc::NdArray<double>(1, ramps.shape().cols);
        for (int j = 0; j < ramps.shape().cols; j++) {
            ramps_1[j] = ramps(i, j);
        }
        auto ramps_2 = nc::NdArray<double>(1, ramps.shape().cols);
        for (int j = 0; j < ramps.shape().cols; j++) {
            ramps_2[j] = ramps(i + 2, j);
        }
        auto lower = ramps_1 * -1 / fdiff[i];
        auto upper = ramps_2 / fdiff[i + 1];
        // .. then intersect them with each other and zero
        auto weights_1 = nc::maximum(nc::zeros<double>(1, ramps.shape().cols), nc::minimum(lower, upper));
        for (int j = 0; j < n_fft_2; j++) {
            weights(i, j) = weights_1[j];
        }
    }
    auto time6 = get_milli_time();
    std::cout << "\ttime5-6 cost: " << time6 - time5 << std::endl;

    // Slaney-style mel is scaled to be approx constant energy per channel
    auto enorm = nc::NdArray<double>(1, nc::uint32(n_mels));
    for (int j = 0; j < n_mels; j++) {
        enorm[j] = 2.0 / (mel_f[j + 2] - mel_f[j]);
    }
    for (int j = 0; j < n_mels; j++) {
        for (int k = 0; k < n_fft_2; k++) {
            weights(j, k) *= enorm[j];
        }
    }
    auto time7 = get_milli_time();
    std::cout << "\tend cost: " << time7 - time6 << std::endl;
    return weights;
}

//nc::NdArray<double> MagnitudeSpectrogram(const nc::NdArray<double> *emphasis_data, int n_fft = 2048, int hop_length = 0,
//                                         int win_length = 0) {
cv::Mat_<double> MagnitudeSpectrogram(const cv::Mat_<float> *emphasis_data, int n_fft = 2048, int hop_length = 0,
                                      int win_length = 0) {
    if (win_length == 0) {
        win_length = n_fft;
    }
    if (hop_length == 0) {
        hop_length = win_length / 4;
    }

    //Ref Symmetric Filling//2ms
    int pad_lenght = n_fft / 2;
    auto timein1 = get_milli_time();
    //nc::pad() will also fill two-dimensional, 2049*45398, and can not complete reflect ion filling, so can only write their own.
    //auto ncbuffer_pad = nc::pad(ncbuffer, nc::uint16(pad_lenght), 0.0);
//    auto padbuffer = nc::NdArray<double>(1, (*emphasis_data).size() + n_fft) = 0.0;
//    padbuffer.put(nc::Slice(pad_lenght, pad_lenght + (*emphasis_data).size()), *emphasis_data);
//    int padbuf_index = padbuffer.size() - 1;
//    for (int i = 1; i <= pad_lenght; i++) {
//        padbuffer[pad_lenght - i] = padbuffer[pad_lenght + i];
//        padbuffer[padbuf_index - (pad_lenght - i)] = padbuffer[padbuf_index - (pad_lenght + i)];
//    }
//    std::cout << "\033[33m" << std::endl;
//    for (int i = 0; i < padbuffer.size(); i++) {
//        std::cout << padbuffer[i] << ", " << std::endl;
//    }
//    std::cout << "\033[0m" << std::endl;
//    auto timein12 = get_milli_time();
//    std::cout << "\treflect cost: " << timein12 - timein1 << std::endl;
    //Use copyMakeBorder in opencv to complete reflect ion filling
    //cv::Mat cv_padbuffer(emphasis_data->shape().rows, emphasis_data->shape().cols, CV_64FC1, emphasis_data->data());
    cv::Mat_<float> cv_padbuffer;
//    (*emphasis_data).convertTo(cv_padbuffer, CV_32FC1);
    cv::copyMakeBorder(*emphasis_data, cv_padbuffer, 0, 0, pad_lenght, pad_lenght, cv::BORDER_REFLECT_101);
//    std::cout << "\033[34m" << std::endl;
//    for (int i = 0; i < cv_padbuffer.rows * cv_padbuffer.cols; i++) {
//        std::cout << cv_padbuffer.at<float>(i) << ", " << std::endl;
//    }
//    std::cout << "\033[0m" << std::endl;

    auto timein11 = get_milli_time();
    std::cout << "\tcopyMakeBorder cost: " << timein11 - timein1 << std::endl;

    // windowing: Multiply each frame by Hanning window to increase the continuity of left and right frames. //0ms
    // Generate a 1600-length hannWindow with a median length of 2048
    if (hannWindow.empty()) {
        hannWindow = cv::Mat_<float>(1, n_fft, 0.0f);
        int insert_cnt = 0;
        if (n_fft > win_length) {
            insert_cnt = (n_fft - win_length) / 2;
        } else {
            std::cout << "\tn_fft:" << n_fft << " > win_length:" << n_fft << std::endl;
            return cv::Mat_<double>(0);
        }
        for (int k = 1; k <= win_length; k++) {
            hannWindow(0, k - 1 + insert_cnt) = float(0.5 * (1 - cos(2 * pi * k / (win_length + 1))));
        }
    }
    //Although opencv has Hann window generating function, it must require width > 1, height > 1
//    cv::Mat_<double> cv_hannWindow;
//    cv::createHanningWindow(cv_hannWindow, cv::Size(1, win_length), CV_64FC1);

//    int size = padbuffer.size();
    int size = cv_padbuffer.rows * cv_padbuffer.cols;//padbuffer.size()
    int number_feature_vectors = (size - n_fft) / hop_length + 1;
    int number_coefficients = n_fft / 2 + 1;
//    std::vector<std::vector<float>> feature_vector((unsigned long) number_feature_vectors,
//                                                   std::vector<float>(number_coefficients * 2, 0));
    cv::Mat_<float> feature_vector(number_feature_vectors, number_coefficients, 0.0f);
    //auto feature_array = nc::empty<double>(nc::uint32(number_feature_vectors), nc::uint32(n_fft));
    auto timein2 = get_milli_time();
    //10ms
    std::cout << "\twindowing cost: " << timein2 - timein11 << std::endl;
    for (int i = 0; i <= size - n_fft; i += hop_length) {
//        std::vector<float> framef(n_fft, 0);
//        for (int j = 0; j < n_fft; j++) {
//            Framef [j] = cv_padbuffer.at < double_t > (i + j); / / padbuffer [i + j]; / / fetch a piece of data at a time
//        }
        cv::Mat_<float> framef = cv::Mat_<float>(1, n_fft, (float *) (cv_padbuffer.data) + i).clone();
        // Add hann window
//        for (int j = 0; j < n_fft; j++) {
//            framef[j] *= hannWindow[j];
//        }
        framef = framef.mul(hannWindow);
        // Complex: Xr real number, Xi imaginary number.
//        std::vector<double> Xr(n_fft, 0);
//        std::vector<double> Xi(n_fft, 0);
//        for (int k = 0; k < n_fft; k++) {
//            Xr[k] = (k < n_fft) ? (frame[k]) : (0);
//            Xi[k] = 0;
//        }
        //FFT(1, n_fft, Xr, Xi);
        // The above FFT operation takes too long (more than 160 ms of total circulation), using the following method (9 ms of total circulation)
        audiofft::AudioFFT fft;
        fft.init(size_t(n_fft));
        cv::Mat_<float> Xrf(1, number_coefficients);
        cv::Mat_<float> Xif(1, number_coefficients);
//        std::vector<float> Xif(n_fft, 0);
        fft.fft((float *) (framef.data), (float *) (Xrf.data), (float *) (Xif.data));

//        {// fft using opencv crashes
//            cv::Mat I = imread("/d/images/people.jpg", cv::IMREAD_GRAYSCALE); //read in image grayscale
//            cv::Mat padded; // Fill in the input image matrix with 0
//            int m = cv::getOptimalDFTSize(I.rows);
//            int n = cv::getOptimalDFTSize(I.cols);
//            // Fill in the input image I, the input matrix is padded, and no filling is done on the top and left sides.
//            copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, cv::BORDER_CONSTANT, cv::Scalar::all(0));
//            cv::Mat planes[] = { cv::Mat_<float>(padded), cv::Mat::zeros(padded.size(),CV_32F) };
//            cv::Mat complexI;
//            merge(planes, 2, complexI); // merge planes into a multi-channel array complexI
//            cv::dft(complexI, complexI); // Fourier transform
//        }

        // Seeking module
        for (int k = 0; k < number_coefficients; k++) {
//            //feature_vector[i / hop_length][k] = sqrt(Xr[k] * Xr[k] + Xi[k] * Xi[k]);
//            feature_vector[i / hop_length][k] = sqrt(Xrf[k] * Xrf[k] + Xif[k] * Xif[k]);
        }
        cv::pow(Xrf, 2, Xrf);
        cv::pow(Xif, 2, Xif);
        cv::Mat_<float> cv_feature(1, number_coefficients, &(feature_vector[i / hop_length][0]));
        cv::sqrt(Xrf + Xif, cv_feature);
    }
    auto timein3 = get_milli_time();
    //2ms
    std::cout << "\tFFT cost: " << timein3 - timein2 << std::endl;
//    auto mag = nc::NdArray<double>(nc::uint32(number_coefficients), nc::uint32(number_feature_vectors));
//    for (int i = 0; i < number_coefficients; i++) {
//        for (int j = 0; j < number_feature_vectors; j++) {
//            mag(i, j) = feature_vector[j][i];
//        }
//    }
    cv::Mat_<float> cv_mag;
    cv::transpose(feature_vector, cv_mag);
    cv::Mat_<double> mag;
    cv_mag.convertTo(mag, CV_64FC1);

    auto timein4 = get_milli_time();
    std::cout << "\tend cost: " << timein4 - timein3 << std::endl;
    return mag;
}

//nc::NdArray<float> mfcc(std::vector<char> ifile_data, int nSamples_per_sec) {
cv::Mat mfcc(std::vector<char> ifile_data, int nSamples_per_sec) {
    auto time1 = get_milli_time();
    if (nSamples_per_sec != nSamplesPerSec) {
        std::cout << R"(the "nSamples_per_sec" is not 16000.)" << std::endl;
        cv::Mat mat;
        return mat;
    }
    int ifile_length = int(ifile_data.size() / 4);
    // Pre-emphasis, pre-emphasis//0ms
//    auto emphasis_data = nc::NdArray<double>(1, nc::uint32(ifile_length)) = 0.0;
//    emphasis_data[0] = *(float *) (ifile_data.data());
//    for (int i = 1; i < ifile_length; i++) {
//        emphasis_data[i] = *(float *) (&ifile_data[i * 4]) - preemphasis * *(float *) (&ifile_data[(i - 1) * 4]);
//    }
    //Use opencv to try
    cv::Mat_<float> d1(1, ifile_length - 1, (float *) (ifile_data.data()) + 1);
    cv::Mat_<float> d2(1, ifile_length - 1, (float *) (ifile_data.data()));
    cv::Mat_<float> cv_emphasis_data;
    cv::hconcat(cv::Mat_<float>::zeros(1, 1), d1 - d2 * preemphasis, cv_emphasis_data);

    auto time2 = get_milli_time();
    std::cout << "pre-emphasis cost: " << time2 - time1 << std::endl;

    // magnitude spectrogram amplitude spectrogram //13ms
    auto mag = MagnitudeSpectrogram(&cv_emphasis_data, length_DFT, hop_length, win_length);
    auto time3 = get_milli_time();
    std::cout << "MagnitudeSpectrogram cost: " << time3 - time2 << std::endl;

    // Generating Mel spectrogram //5ms
    if (mel_basis.isempty()) {
        mel_basis = mel_spectrogram_create(nSamplesPerSec, length_DFT, number_filterbanks);
    }
    auto time4 = get_milli_time();
    std::cout << "mel_calc cost: " << time4 - time3 << std::endl;

//    auto mel = nc::dot<double>(mel_basis, mag);     //20ms
    //The above calculation takes too long. I use opencv to do it. //2ms
    cv::Mat cv_mel_basis(mel_basis.shape().rows, mel_basis.shape().cols, CV_64FC1, mel_basis.data());
    cv::Mat cv_mel = cv_mel_basis * mag;
//    auto mel = nc::NdArray<double>(cv_mel.rows, cv_mel.cols);
//    for (int i = 0; i < cv_mel.rows; i++) {
//        for (int j = 0; j < cv_mel.cols; j++) {
//            mel(i, j) = cv_mel.at<double_t>(i, j);
//        }
//    }
    //Then try pytorch. //62ms
//    at::TensorOptions options(at::ScalarType::Double);
//    auto torch_mel_basis = torch::from_blob(mel_basis.data(), {mel_basis.shape().rows, mel_basis.shape().cols},
//                                            options).to(at::kCPU);
//    torch_mel_basis.print();
//    auto torch_mag = torch::from_blob(mag.data(), {mag.shape().rows, mag.shape().cols}, options).to(at::kCPU);
//    auto torch_mel = torch::matmul(torch_mel_basis, torch_mag);
//    auto mel = nc::NdArray<double>(mel_basis.shape().rows, mag.shape().cols);
//    for (int i = 0; i < mel_basis.shape().rows; i++) {
//        for (int j = 0; j < mag.shape().cols; j++) {
//            mel(i, j) = torch_mel[i][j].item().toDouble();
//        }
//    }

    auto time5 = get_milli_time();
    std::cout << "nc::dot cost: " << time5 - time4 << std::endl;


    // to decibel           //2ms
    //mel = nc::log10(nc::maximum(mel, nc::NdArray<double>(mel.shape().rows, mel.shape().cols) = 1e-5)) * 20;
    //mag = nc::log10(nc::maximum(mag, nc::NdArray<double>(mag.shape().rows, mag.shape().cols) = 1e-5)) * 20;
    //Try to use opencv to implement //0ms
    cv::log(cv::max(cv_mel, 1e-5), cv_mel);
    //opencv does not have log10(), so log(x)/log(10) is used to calculate.
    cv_mel = cv_mel / 2.3025850929940459 * 20; //2.3025850929940459=log(10) natural logarithm
//    auto mel = nc::NdArray<double>(nc::uint32(cv_mel.rows), nc::uint32(cv_mel.cols));
//    for (int i = 0; i < cv_mel.rows; i++) {
//        for (int j = 0; j < cv_mel.cols; j++) {
//            mel(i, j) = cv_mel.at<double_t>(i, j);
//        }
//    }
    //Try pytorch to implement //47ms
//    at::TensorOptions options(at::ScalarType::Double);
//    auto torch_mel = torch::from_blob(mel.data(), {mel.shape().rows, mel.shape().cols}, options).to(at::kCPU);
//    auto torch_mag = torch::from_blob(mag.data(), {mag.shape().rows, mag.shape().cols}, options).to(at::kCPU);
//    torch_mel = torch::log10(torch::max(torch_mel, torch::tensor(1e-5))) * 20;
//    torch_mag = torch::log10(torch::max(torch_mag, torch::tensor(1e-5))) * 20;

    auto time6 = get_milli_time();
    std::cout << "nc::log10 cost: " << time6 - time5 << std::endl;

    // normalize        //1ms
//    mel = nc::clip((mel - ref_db + max_db) / max_db, 1e-8, 1.0);
//    mel.print();
    //mag = nc::clip((mag - ref_db + max_db) / max_db, 1e-8, 1.0);
    //Using normalize of opencv to implement //ms
    cv_mel = (cv_mel - ref_db + max_db) / max_db;
    //cv::normalize(cv_mel, cv_mel, 1e-8, 1.0, cv::NORM_MINMAX);//cv::normalize cannot be achieved
    cv_mel = cv::max(cv::min(cv_mel, 1.0), 1e-8);
    //cv::threshold(cv_mel, cv_mel, 1e-8, 1.0, cv::THRESH_BINARY); // Comparing with the threshold, the conformity is equal to the set value.
//    auto mel = nc::NdArray<double>(nc::uint32(cv_mel.rows), nc::uint32(cv_mel.cols));
////    std::cout << "\033[33m" << std::endl;
//    for (int i = 0; i < cv_mel.rows; i++) {
//        for (int j = 0; j < cv_mel.cols; j++) {
//            mel(i, j) = cv_mel.at<double_t>(i, j);
////            std::cout << mel(i, j) << ", ";
//        }
//    }
//    std::cout << "\033[0m" << std::endl;

    auto time7 = get_milli_time();
    std::cout << "nc::clip cost: " << time7 - time6 << std::endl;

    // Transpose
//    auto mel_r = nc::transpose(mel).astype<float>();
    //auto mag_r = nc::transpose(mag).astype<float>();
    // Use opencv transpose to try
    cv::Mat cv_mel_r;
    cv::transpose(cv_mel, cv_mel_r);
//    auto mel_r = nc::NdArray<double>(nc::uint32(cv_mel_r.rows), nc::uint32(cv_mel_r.cols));
//    for (int i = 0; i < cv_mel_r.rows; i++) {
//        for (int j = 0; j < cv_mel_r.cols; j++) {
//            mel_r(i, j) = cv_mel_r.at<double_t>(i, j);
//        }
//    }
    cv_mel_r.convertTo(cv_mel_r, CV_32FC1);

    auto time8 = get_milli_time();
    std::cout << "nc::transpose cost: " << time8 - time7 << std::endl;

    // r=1 in python
    if (r == 1) {
        // When r=1, there is no numerical change in the formula operation.
//            mel = mel[:len(mel) / r * r].reshape([len(mel) / r, r * number_filterbanks]);
//            mag = mag[:len(mag) / r * r];
    } else {
        std::cout << R"(the "r" is not 1.)" << std::endl;
    }
    auto time9 = get_milli_time();
    std::cout << "end cost: " << time9 - time8 << std::endl;
    // Return mel eigenvector
    return cv_mel_r;
}

Posted by kamurj on Thu, 01 Aug 2019 22:28:01 -0700