First pass at phase training; not overly impressed by the results, but the basic framework is doing sensible things, e.g. overtraining on the small all_8k database

master
David 2019-12-07 11:58:21 +10:30
parent fd14956b60
commit 0b34b92906
3 changed files with 131 additions and 87 deletions

View File

@ -1,114 +1,158 @@
#!/usr/bin/python3 #!/usr/bin/python3
# phasenn_train.py # phasenn_train.py
# #
# David Rowe August 2019 # David Rowe Dec 2019
#
# Train a NN to model phase from Codec 2 (sinusoidal model) amplitudes.
# #
# Keras model for estimating the phase of sinusoidally modelled speech
# To generate features:
# $ ./c2sim ~/Downloads/all_speech_8k.sw --dumpphase_nnl train.f32
import numpy as np import numpy as np
import sys import sys
from keras.layers import Dense import matplotlib.pyplot as plt
from scipy import signal
import codec2_model
import argparse
import os
from keras.layers import Input, Dense, Concatenate
from keras import models,layers from keras import models,layers
from keras import initializers from keras import initializers
from keras import backend as K
if len(sys.argv) < 2: # less verbose tensorflow ....
print("usage: phasenn_train.py train.f32") os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
sys.exit(0)
# constants # constants
max_amp = 80 # sparsevector covering 0 ... Fs/2 N = 80 # number of time domain samples in frame
nb_features = 243 # number of sparse features/row in input training data file width = 256
nb_epochs = 10 pairs = 2*width
Fs = 8000
nb_batch = 32
nb_plots = 4
# load training data def list_str(values):
return values.split(',')
feature_file = sys.argv[1] parser = argparse.ArgumentParser(description='Train a NN to model Codec 2 phases')
features = np.fromfile(feature_file, dtype='float32') parser.add_argument('modelfile', help='Codec 2 model file with linear phase removed')
nb_frames = int(len(features)/nb_features) parser.add_argument('--frames', type=list_str, default="30,31,32,33", help='Frames to view')
print("nb_frames: %d" % (nb_frames)) parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs')
args = parser.parse_args()
# 0..80 log10(A) assert nb_plots == len(args.frames)
# 81..161 cos(phi)
# 162..242 sin(phi)
features = np.reshape(features, (nb_frames, nb_features)) # read in model file records
print("features shape:") Wo, L, A, phase, voiced = codec2_model.read(args.modelfile)
print(features.shape) nb_samples = Wo.size;
print("nb_samples: %d" % (nb_samples))
# So the idea is we can predict the next frames phases from the # set up sparse vectors, phase represented by cos(), sin() pairs
# current frame, and the magnitude spectrum. For voiced speech, the amp = np.zeros((nb_samples, width))
# sinusoids are continuous, so can be predicted from frame to frame if phase_rect = np.zeros((nb_samples, pairs))
# you know the frequency and previous phase. We encode the frequency for i in range(nb_samples):
# as the position in the sprase vector. for m in range(1,L[i]+1):
bin = int(np.round(m*Wo[i]*width/np.pi)); bin = min(width-1, bin)
# Cascased with that is phase spectra due to dispersion of the phase amp[i,bin] = np.log10(A[i,m])
# response of the vocal tract filter, e.g. a large dispersion around phase_rect[i,2*bin] = np.cos(phase[i,m])
# resonances. We supply the magnitude spectra to help model the vocal phase_rect[i,2*bin+1] = np.sin(phase[i,m])
# tract filter phase.
# Unvoiced speech has more random phase. Hopefully the NN can work
# out if the speech is voiced or unvoiced from the magnitude spectra.
# The phase is encoded using cos and sin of the phase, as these are
# bounded by +/-1
# So the input features are this frame's log(A), and the last frame's phase.
# The output features we are trying to model are this frame's phase.
train = np.concatenate( (features[1:,:max_amp+1], features[:-1,max_amp+1:]) )
target = features([1:,max_amp+1:])
# our model
model = models.Sequential() model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=nb_input)) model.add(layers.Dense(pairs, activation='relu', input_dim=width))
model.add(layers.Dense(256, activation='relu')) model.add(layers.Dense(4*pairs, activation='relu'))
model.add(layers.Dense(256, activation='relu')) model.add(layers.Dense(pairs))
model.add(layers.Dense(nb_ouput, activation='linear')) model.summary()
# Custom loss function that measures difference in phase just at # custom loss function
# non-zero elements of target (ytrue). This could be extended to def sparse_loss(y_true, y_pred):
# weight each phase error by the (log) Amplitude of each harmonic mask = K.cast( K.not_equal(y_pred, 0), dtype='float32')
n = K.sum(mask)
return K.sum(K.square((y_pred - y_true)*mask))/n
import keras.backend as K # testing custom loss function
def customLoss(yTrue, yPred): x = Input(shape=(None,))
# generate a mask vector with 1's on non zero values of yTrue y = Input(shape=(None,))
mask = abs(K.sign(yTrue)) loss_func = K.Function([x, y], [sparse_loss(x, y)])
# collect error in cos() and sin() terms, ignoring yPred values outside of assert loss_func([[[1,1,1]], [[0,2,0]]]) == np.array([1])
# harmonics we care about assert loss_func([[[0,1,0]], [[0,2,0]]]) == np.array([1])
error = yTrue - mask * yPred
return K.sum(error * error)
# Compile our model
# fit the model
from keras import optimizers from keras import optimizers
model.compile(loss=customLoss, optimizer='sge') sgd = optimizers.SGD(lr=0.8, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss=sparse_loss, optimizer=sgd)
history = model.fit(amp, phase_rect, batch_size=nb_batch, epochs=args.epochs)
# fit model, using 20% of our data for validation
history = model.fit(train, target, validation_split=0.2, batch_size=32, epochs=nb_epochs) # measure error in angle over all samples
model.save("phasenn_model.h5")
import matplotlib.pyplot as plt phase_rect_est = model.predict(amp)
# Convert the network's (cos, sin) output pairs back into one angle per
# harmonic.  Results are stored by harmonic number, not by sparse bin index;
# used_bins flags the (frame, harmonic) entries that were actually filled in.
phase_est = np.zeros((nb_samples, width))
used_bins = np.zeros((nb_samples, width), dtype=int)
for frame in range(nb_samples):
    for harm in range(1, L[frame] + 1):
        # nearest sparse-vector bin for this harmonic, clamped to the last bin
        k = min(width - 1, int(np.round(harm * Wo[frame] * width / np.pi)))
        cos_part = phase_rect_est[frame, 2 * k]
        sin_part = phase_rect_est[frame, 2 * k + 1]
        phase_est[frame, harm] = np.angle(cos_part + 1j * sin_part)
        used_bins[frame, harm] = 1
plot_en = 0; ind = np.nonzero(used_bins)
if plot_en: c1 = np.exp(1j*phase[ind]); c2 = np.exp(1j*phase_est[ind]);
plt.figure(1) err_angle = np.angle(c1 * np.conj(c2))
plt.plot(10*np.sqrt(history.history['loss'])) var = np.var(err_angle)
plt.plot(10*np.sqrt(history.history['val_loss'])) std = np.std(err_angle)
plt.title('model loss') print("angle var: %4.2f std: %4.2f rads" % (var,std))
plt.ylabel('rms error (rad)') print("angle var: %4.2f std: %4.2f degs" % ((std*180/np.pi)**2,std*180/np.pi))
plt.xlabel('epoch')
plt.legend(['train', 'valid'], loc='upper right')
plt.show()
# run model on training data and measure variance, should be similar to training "loss"
train_out = model.predict(train) # synthesise time domain signal
err = (train_out - target) def sample_time(r, phase):
var = np.var(err) s = np.zeros(2*N);
std = np.std(err)
print("var: %f std: %f" % (var,std))
for m in range(1,L[r]+1):
s = s + A[r,m]*np.cos(m*Wo[r]*range(-N,N) + phase[r,m])
return s
# ---- Diagnostic plots: training loss, then amplitude, phase and time domain
# views of the frames selected with --frames ----

# plt.subplot() needs an integer grid.  np.floor() and "/" both produce
# floats, which current matplotlib releases reject; ceil on the column count
# also guarantees enough cells when nb_plots is not a perfect square.
nb_plotsy = int(np.floor(np.sqrt(nb_plots)))
nb_plotsx = int(np.ceil(nb_plots / nb_plotsy))
frames = np.array(args.frames, dtype=int)

# training loss per epoch
plt.figure(1)
plt.plot(history.history['loss'])
plt.title('model loss')
plt.xlabel('epoch')
plt.show(block=False)

# log10 harmonic amplitudes of each selected frame
plt.figure(2)
plt.title('Amplitudes Spectra')
for r in range(nb_plots):
    plt.subplot(nb_plotsy, nb_plotsx, r + 1)
    f = frames[r]
    # harmonics are numbered 1..L[f] inclusive, so the slice ends at L[f]+1
    plt.plot(np.log10(A[f, 1:L[f] + 1]), 'g')
    t = "frame %d" % (f)
    plt.title(t)
plt.show(block=False)

# original (green) and estimated (red) harmonic phases, in degrees
plt.figure(3)
plt.title('Phase Spectra')
for r in range(nb_plots):
    plt.subplot(nb_plotsy, nb_plotsx, r + 1)
    f = frames[r]
    plt.plot(phase[f, 1:L[f] + 1] * 180 / np.pi, 'g')
    plt.plot(phase_est[f, 1:L[f] + 1] * 180 / np.pi, 'r')
    plt.ylim(-180, 180)
# NOTE(review): legend is applied once, to the last subplot -- confirm intent
plt.legend(("phase", "phase_est"))
plt.show(block=False)

# time domain signal synthesised from the original and the estimated phases
plt.figure(4)
plt.title('Time Domain')
for r in range(nb_plots):
    plt.subplot(nb_plotsy, nb_plotsx, r + 1)
    f = frames[r]
    s = sample_time(f, phase)
    s_est = sample_time(f, phase_est)
    plt.plot(range(-N, N), s, 'g')
    plt.plot(range(-N, N), s_est, 'r')
# NOTE(review): legend is applied once, to the last subplot -- confirm intent
plt.legend(("s", "s_est"))
plt.show(block=False)

# keep the non-blocking figures on screen until the user dismisses them
print("Click on last figure to finish....")
plt.waitforbuttonpress(0)
plt.close()

View File

@ -12,4 +12,4 @@ x=$(basename $speech)
base="${x%.*}" base="${x%.*}"
c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model' c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model'
./plot_n0.py $base'_nolinear.model' --start 25 ./plot_n0.py $base'_nolinear.model'

View File

@ -4,9 +4,9 @@
PATH=$PATH:~/codec2/build_linux/src:~/codec2/build_linux/misc PATH=$PATH:~/codec2/build_linux/src:~/codec2/build_linux/misc
speech=~/Downloads/all_8k.sw speech=~/Downloads/train_8k.sw
x=$(basename $speech) x=$(basename $speech)
base="${x%.*}" base="${x%.*}"
c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model' sox -t .sw -r 8000 -c 1 $speech -t .sw - trim 0 600 | c2sim - --modelout - | est_n0 -r > $base'_nolinear.model'
./phasenn_train.py $base'_nolinear.model' --frames 560,655,990,2899 ./phasenn_train.py $base'_nolinear.model' --frames 1572,1908,6792,9600