mirror of https://github.com/drowe67/phasenn.git
first pass phase training, not overly impressed by results, but basic framework doing sensible things, e.g. overtraining on small, all_8k database
parent
fd14956b60
commit
0b34b92906
phasenn_train.py (210)
@@ -1,114 +1,158 @@
 #!/usr/bin/python3
 # phasenn_train.py
 #
-# David Rowe August 2019
+# David Rowe Dec 2019
+#
+# Train a NN to model phase from Codec 2 (sinusoidal model) amplitudes.
 #
-# Keras model for estimating the phase of sinusoidally modelled speech
-
-# To generate features:
-# $ ./c2sim ~/Downloads/all_speech_8k.sw --dumpphase_nnl train.f32

 import numpy as np
 import sys
-from keras.layers import Dense
+import matplotlib.pyplot as plt
+from scipy import signal
+import codec2_model
+import argparse
+import os
+from keras.layers import Input, Dense, Concatenate
 from keras import models,layers
 from keras import initializers
+from keras import backend as K

-if len(sys.argv) < 2:
-    print("usage: phasenn_train.py train.f32")
-    sys.exit(0)
+# less verbose tensorflow ....
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

 # constants

-max_amp = 80      # sparsevector covering 0 ... Fs/2
-nb_features = 243 # number of sparse features/row in input training data file
-nb_epochs = 10
+N = 80            # number of time domain samples in frame
+width = 256
+pairs = 2*width
+Fs = 8000
+nb_batch = 32
+nb_plots = 4

-# load training data
+def list_str(values):
+    return values.split(',')

-feature_file = sys.argv[1]
-features = np.fromfile(feature_file, dtype='float32')
-nb_frames = int(len(features)/nb_features)
-print("nb_frames: %d" % (nb_frames))
+parser = argparse.ArgumentParser(description='Train a NN to model Codec 2 phases')
+parser.add_argument('modelfile', help='Codec 2 model file with linear phase removed')
+parser.add_argument('--frames', type=list_str, default="30,31,32,33", help='Frames to view')
+parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs')
+args = parser.parse_args()

-# 0..80 log10(A)
-# 81..161 cos(phi)
-# 162..242 sin(phi)
+assert nb_plots == len(args.frames)

-features = np.reshape(features, (nb_frames, nb_features))
-print("features shape:")
-print(features.shape)
+# read in model file records
+Wo, L, A, phase, voiced = codec2_model.read(args.modelfile)
+nb_samples = Wo.size;
+print("nb_samples: %d" % (nb_samples))

-# So the idea is we can predict the next frames phases from the
-# current frame, and the magnitude spectrum. For voiced speech, the
-# sinusoids are continuous, so can be predicted from frame to frame if
-# you know the frequency and previous phase. We encode the frequency
-# as the position in the sprase vector.
-
-# Cascased with that is phase spectra due to dispersion of the phase
-# response of the vocal tract filter, e.g. a large dispersion around
-# resonances. We supply the magnitude spectra to help model the vocal
-# tract filter phase.
-
-# Unvoiced speech has more random phase. Hopefully the NN can work
-# out if the speech is voiced or unvoiced from the magnitide spectra.
-
-# The phase is encoded using cos and sin of the phase, as these are
-# bounded by +/-1
-
-# So input features are this frame's log(A), and last frames phase.
-# The output features we are trying to model are this frames phase.
-
-train = np.concatenate( (features[1:,:max_amp+1], features[:-1,max_amp+1:]) )
-target = features([1:,max_amp+1:])
+# set up sparse vectors, phase represented by cos(), sin() pairs
+amp = np.zeros((nb_samples, width))
+phase_rect = np.zeros((nb_samples, pairs))
+for i in range(nb_samples):
+    for m in range(1,L[i]+1):
+        bin = int(np.round(m*Wo[i]*width/np.pi)); bin = min(width-1, bin)
+        amp[i,bin] = np.log10(A[i,m])
+        phase_rect[i,2*bin] = np.cos(phase[i,m])
+        phase_rect[i,2*bin+1] = np.sin(phase[i,m])

+# our model
 model = models.Sequential()
-model.add(layers.Dense(256, activation='relu', input_dim=nb_input))
-model.add(layers.Dense(256, activation='relu'))
-model.add(layers.Dense(256, activation='relu'))
-model.add(layers.Dense(nb_ouput, activation='linear'))
+model.add(layers.Dense(pairs, activation='relu', input_dim=width))
+model.add(layers.Dense(4*pairs, activation='relu'))
+model.add(layers.Dense(pairs))
+model.summary()

-# Custom loss function that measures difference in phase just at
-# non-zero elements of target (ytrue). This could be extended to
-# weight each phase error by the (log) Amplitude of each harmonic
-
-import keras.backend as K
-def customLoss(yTrue, yPred):
-    # generate a mask vector with 1's on non zero values of yTrue
-    mask = abs(K.sign(yTrue))
-    # collect error in cos() and sin() terms, ignoring yPred values outside of
-    # harmonics we care about
-    error = yTrue - mask * yPred
-    return K.sum(error * error)
+# custom loss function
+def sparse_loss(y_true, y_pred):
+    mask = K.cast( K.not_equal(y_pred, 0), dtype='float32')
+    n = K.sum(mask)
+    return K.sum(K.square((y_pred - y_true)*mask))/n
+
+# testing custom loss function
+x = Input(shape=(None,))
+y = Input(shape=(None,))
+loss_func = K.Function([x, y], [sparse_loss(x, y)])
+assert loss_func([[[1,1,1]], [[0,2,0]]]) == np.array([1])
+assert loss_func([[[0,1,0]], [[0,2,0]]]) == np.array([1])

-# Compile our model
-
+# fit the model
 from keras import optimizers
-model.compile(loss=customLoss, optimizer='sge')
-
-# fit model, using 20% of our data for validation
-
-history = model.fit(train, target, validation_split=0.2, batch_size=32, epochs=nb_epochs)
-model.save("phasenn_model.h5")
+sgd = optimizers.SGD(lr=0.8, decay=1e-6, momentum=0.9, nesterov=True)
+model.compile(loss=sparse_loss, optimizer=sgd)
+history = model.fit(amp, phase_rect, batch_size=nb_batch, epochs=args.epochs)

-import matplotlib.pyplot as plt
+# measure error in angle over all samples

-plot_en = 0;
-if plot_en:
-    plt.figure(1)
-    plt.plot(10*np.sqrt(history.history['loss']))
-    plt.plot(10*np.sqrt(history.history['val_loss']))
-    plt.title('model loss')
-    plt.ylabel('rms error (rad)')
-    plt.xlabel('epoch')
-    plt.legend(['train', 'valid'], loc='upper right')
-    plt.show()
+phase_rect_est = model.predict(amp)
+phase_est = np.zeros((nb_samples, width))
+used_bins = np.zeros((nb_samples, width), dtype=int)
+for i in range(nb_samples):
+    for m in range(1,L[i]+1):
+        bin = int(np.round(m*Wo[i]*width/np.pi)); bin = min(width-1, bin)
+        phase_est[i,m] = np.angle(phase_rect_est[i,2*bin] + 1j*phase_rect_est[i,2*bin+1])
+        used_bins[i,m] = 1

-# run model on training data and measure variance, should be similar to training "loss"
-
-train_out = model.predict(train)
-err = (train_out - target)
-var = np.var(err)
-std = np.std(err)
-print("var: %f std: %f" % (var,std))
+ind = np.nonzero(used_bins)
+c1 = np.exp(1j*phase[ind]); c2 = np.exp(1j*phase_est[ind]);
+err_angle = np.angle(c1 * np.conj(c2))
+var = np.var(err_angle)
+std = np.std(err_angle)
+print("angle var: %4.2f std: %4.2f rads" % (var,std))
+print("angle var: %4.2f std: %4.2f degs" % ((std*180/np.pi)**2,std*180/np.pi))

+# synthesise time domain signal
+def sample_time(r, phase):
+    s = np.zeros(2*N);
+    for m in range(1,L[r]+1):
+        s = s + A[r,m]*np.cos(m*Wo[r]*range(-N,N) + phase[r,m])
+    return s

+nb_plotsy = np.floor(np.sqrt(nb_plots)); nb_plotsx=nb_plots/nb_plotsy;
+frames = np.array(args.frames,dtype=int)

+plt.figure(1)
+plt.plot(history.history['loss'])
+plt.title('model loss')
+plt.xlabel('epoch')
+plt.show(block=False)

+plt.figure(2)
+plt.title('Amplitudes Spectra')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r];
+    plt.plot(np.log10(A[f,1:L[f]]),'g')
+    t = "frame %d" % (f)
+    plt.title(t)
+plt.show(block=False)

+plt.figure(3)
+plt.title('Phase Spectra')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r]
+    plt.plot(phase[f,1:L[f]]*180/np.pi,'g')
+    plt.plot(phase_est[f,1:L[f]]*180/np.pi,'r')
+    plt.ylim(-180,180)
+    plt.legend(("phase","phase_est"))
+plt.show(block=False)

+plt.figure(4)
+plt.title('Time Domain')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r];
+    s = sample_time(f, phase)
+    s_est = sample_time(f, phase_est)
+    plt.plot(range(-N,N),s,'g')
+    plt.plot(range(-N,N),s_est,'r')
+    plt.legend(("s","s_est"))
+plt.show(block=False)

+print("Click on last figure to finish....")
+plt.waitforbuttonpress(0)
+plt.close()
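A minimal, self-contained numpy sketch (not part of this commit; Wo, L and phase below are toy values) of the sparse cos()/sin() phase encoding that phasenn_train.py builds above, and of how an angle is recovered from a (cos, sin) pair with np.angle(), the same way the script decodes the network output:

import numpy as np

width = 256                               # sparse vector bins covering 0 ... Fs/2
Wo = np.pi*200.0/4000.0                   # toy fundamental: 200 Hz at Fs = 8000 Hz
L = 10                                    # toy number of harmonics
phase = np.random.uniform(-np.pi, np.pi, L+1)

phase_rect = np.zeros(2*width)
for m in range(1, L+1):
    # the frequency of harmonic m sets its bin position in the sparse vector
    b = min(width-1, int(np.round(m*Wo*width/np.pi)))
    phase_rect[2*b]   = np.cos(phase[m])
    phase_rect[2*b+1] = np.sin(phase[m])

# decode: np.angle() of cos + j*sin returns the original phase, wrapped to (-pi, pi]
for m in range(1, L+1):
    b = min(width-1, int(np.round(m*Wo*width/np.pi)))
    est = np.angle(phase_rect[2*b] + 1j*phase_rect[2*b+1])
    assert abs(np.angle(np.exp(1j*(est - phase[m])))) < 1e-6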
@@ -12,4 +12,4 @@ x=$(basename $speech)
 base="${x%.*}"

 c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model'
-./plot_n0.py $base'_nolinear.model' --start 25
+./plot_n0.py $base'_nolinear.model'
train.sh (6)
@@ -4,9 +4,9 @@

 PATH=$PATH:~/codec2/build_linux/src:~/codec2/build_linux/misc

-speech=~/Downloads/all_8k.sw
+speech=~/Downloads/train_8k.sw
 x=$(basename $speech)
 base="${x%.*}"

-c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model'
-./phasenn_train.py $base'_nolinear.model' --frames 560,655,990,2899
+sox -t .sw -r 8000 -c 1 $speech -t .sw - trim 0 600 | c2sim - --modelout - | est_n0 -r > $base'_nolinear.model'
+./phasenn_train.py $base'_nolinear.model' --frames 1572,1908,6792,9600
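Two of the ideas in phasenn_train.py above, shown as a small stand-alone numpy sketch (illustrative only, made-up vectors, not code from the repo): the custom sparse_loss averages squared error only over the non-zero elements of the sparse vectors, and the angle error is measured through complex rotation so that it wraps correctly rather than reporting a near-2*pi difference as a large error:

import numpy as np

# masked squared error, as in sparse_loss(): zero bins do not contribute
y_true = np.array([0.0, 1.0, 0.0, -0.5])
y_pred = np.array([0.1, 0.8, 0.0, -0.4])
mask = (y_pred != 0).astype(float)
loss = np.sum(((y_pred - y_true)*mask)**2) / mask.sum()
print("masked loss: %f" % loss)           # 0.02, averaged over the 3 non-zero bins

# wrap-aware angle error, as in err_angle = np.angle(c1 * np.conj(c2))
phase     = np.array([ 3.0, -3.0])
phase_est = np.array([-3.0,  3.0])
err_angle = np.angle(np.exp(1j*phase) * np.conj(np.exp(1j*phase_est)))
print(err_angle)                          # about [-0.28, 0.28] rad, not +/- 6 rad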