mirror of https://github.com/drowe67/phasenn.git
first pass phase training, not overly impressed by results, but basic framework doing sensible things, e.g. overtraining on small, all_8k database
parent
fd14956b60
commit
0b34b92906
phasenn_train.py (210)
@@ -1,114 +1,158 @@
 #!/usr/bin/python3
 # phasenn_train.py
 #
-# David Rowe August 2019
+# David Rowe Dec 2019
+#
+# Train a NN to model phase from Codec 2 (sinusoidal model) amplitudes.
 #
-# Keras model for estimating the phase of sinusoidally modelled speech
-
-# To generate features:
-# $ ./c2sim ~/Downloads/all_speech_8k.sw --dumpphase_nnl train.f32

 import numpy as np
 import sys
-from keras.layers import Dense
+import matplotlib.pyplot as plt
+from scipy import signal
+import codec2_model
+import argparse
+import os
+from keras.layers import Input, Dense, Concatenate
 from keras import models,layers
 from keras import initializers
+from keras import backend as K

-if len(sys.argv) < 2:
-    print("usage: phasenn_train.py train.f32")
-    sys.exit(0)
+# less verbose tensorflow ....
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

 # constants

-max_amp = 80      # sparsevector covering 0 ... Fs/2
-nb_features = 243 # number of sparse features/row in input training data file
-nb_epochs = 10
+N = 80            # number of time domain samples in frame
+width = 256
+pairs = 2*width
+Fs = 8000
+nb_batch = 32
+nb_plots = 4

-# load training data
+def list_str(values):
+    return values.split(',')

-feature_file = sys.argv[1]
-features = np.fromfile(feature_file, dtype='float32')
-nb_frames = int(len(features)/nb_features)
-print("nb_frames: %d" % (nb_frames))
+parser = argparse.ArgumentParser(description='Train a NN to model Codec 2 phases')
+parser.add_argument('modelfile', help='Codec 2 model file with linear phase removed')
+parser.add_argument('--frames', type=list_str, default="30,31,32,33", help='Frames to view')
+parser.add_argument('--epochs', type=int, default=100, help='Number of training epochs')
+args = parser.parse_args()

-# 0..80 log10(A)
-# 81..161 cos(phi)
-# 162..242 sin(phi)
+assert nb_plots == len(args.frames)

-features = np.reshape(features, (nb_frames, nb_features))
-print("features shape:")
-print(features.shape)
+# read in model file records
+Wo, L, A, phase, voiced = codec2_model.read(args.modelfile)
+nb_samples = Wo.size;
+print("nb_samples: %d" % (nb_samples))

-# So the idea is we can predict the next frames phases from the
-# current frame, and the magnitude spectrum. For voiced speech, the
-# sinusoids are continuous, so can be predicted from frame to frame if
-# you know the frequency and previous phase. We encode the frequency
-# as the position in the sprase vector.
-
-# Cascased with that is phase spectra due to dispersion of the phase
-# response of the vocal tract filter, e.g. a large dispersion around
-# resonances. We supply the magnitude spectra to help model the vocal
-# tract filter phase.
-
-# Unvoiced speech has more random phase. Hopefully the NN can work
-# out if the speech is voiced or unvoiced from the magnitide spectra.
-
-# The phase is encoded using cos and sin of the phase, as these are
-# bounded by +/-1
-
-# So input features are this frame's log(A), and last frames phase.
-# The output features we are trying to model are this frames phase.
-
-train = np.concatenate( (features[1:,:max_amp+1], features[:-1,max_amp+1:]) )
-target = features([1:,max_amp+1:])
+# set up sparse vectors, phase represented by cos(), sin() pairs
+amp = np.zeros((nb_samples, width))
+phase_rect = np.zeros((nb_samples, pairs))
+for i in range(nb_samples):
+    for m in range(1,L[i]+1):
+        bin = int(np.round(m*Wo[i]*width/np.pi)); bin = min(width-1, bin)
+        amp[i,bin] = np.log10(A[i,m])
+        phase_rect[i,2*bin] = np.cos(phase[i,m])
+        phase_rect[i,2*bin+1] = np.sin(phase[i,m])

+# our model
 model = models.Sequential()
-model.add(layers.Dense(256, activation='relu', input_dim=nb_input))
-model.add(layers.Dense(256, activation='relu'))
-model.add(layers.Dense(256, activation='relu'))
-model.add(layers.Dense(nb_ouput, activation='linear'))
+model.add(layers.Dense(pairs, activation='relu', input_dim=width))
+model.add(layers.Dense(4*pairs, activation='relu'))
+model.add(layers.Dense(pairs))
+model.summary()

-# Custom loss function that measures difference in phase just at
-# non-zero elements of target (ytrue). This could be extended to
-# weight each phase error by the (log) Amplitude of each harmonic
-
-import keras.backend as K
-def customLoss(yTrue, yPred):
-    # generate a mask vector with 1's on non zero values of yTrue
-    mask = abs(K.sign(yTrue))
-    # collect error in cos() and sin() terms, ignoring yPred values outside of
-    # harmonics we care about
-    error = yTrue - mask * yPred
-    return K.sum(error * error)
+# custom loss function
+def sparse_loss(y_true, y_pred):
+    mask = K.cast( K.not_equal(y_pred, 0), dtype='float32')
+    n = K.sum(mask)
+    return K.sum(K.square((y_pred - y_true)*mask))/n
+
+# testing custom loss function
+x = Input(shape=(None,))
+y = Input(shape=(None,))
+loss_func = K.Function([x, y], [sparse_loss(x, y)])
+assert loss_func([[[1,1,1]], [[0,2,0]]]) == np.array([1])
+assert loss_func([[[0,1,0]], [[0,2,0]]]) == np.array([1])

-# Compile our model
-
+# fit the model
 from keras import optimizers
-model.compile(loss=customLoss, optimizer='sge')
-
-# fit model, using 20% of our data for validation
-
-history = model.fit(train, target, validation_split=0.2, batch_size=32, epochs=nb_epochs)
-model.save("phasenn_model.h5")
+sgd = optimizers.SGD(lr=0.8, decay=1e-6, momentum=0.9, nesterov=True)
+model.compile(loss=sparse_loss, optimizer=sgd)
+history = model.fit(amp, phase_rect, batch_size=nb_batch, epochs=args.epochs)

-import matplotlib.pyplot as plt
+# measure error in angle over all samples

-plot_en = 0;
-if plot_en:
-    plt.figure(1)
-    plt.plot(10*np.sqrt(history.history['loss']))
-    plt.plot(10*np.sqrt(history.history['val_loss']))
-    plt.title('model loss')
-    plt.ylabel('rms error (rad)')
-    plt.xlabel('epoch')
-    plt.legend(['train', 'valid'], loc='upper right')
-    plt.show()
+phase_rect_est = model.predict(amp)
+phase_est = np.zeros((nb_samples, width))
+used_bins = np.zeros((nb_samples, width), dtype=int)
+for i in range(nb_samples):
+    for m in range(1,L[i]+1):
+        bin = int(np.round(m*Wo[i]*width/np.pi)); bin = min(width-1, bin)
+        phase_est[i,m] = np.angle(phase_rect_est[i,2*bin] + 1j*phase_rect_est[i,2*bin+1])
+        used_bins[i,m] = 1

-# run model on training data and measure variance, should be similar to training "loss"
-
-train_out = model.predict(train)
-err = (train_out - target)
-var = np.var(err)
-std = np.std(err)
-print("var: %f std: %f" % (var,std))
+ind = np.nonzero(used_bins)
+c1 = np.exp(1j*phase[ind]); c2 = np.exp(1j*phase_est[ind]);
+err_angle = np.angle(c1 * np.conj(c2))
+var = np.var(err_angle)
+std = np.std(err_angle)
+print("angle var: %4.2f std: %4.2f rads" % (var,std))
+print("angle var: %4.2f std: %4.2f degs" % ((std*180/np.pi)**2,std*180/np.pi))

+# synthesise time domain signal
+def sample_time(r, phase):
+    s = np.zeros(2*N);
+    for m in range(1,L[r]+1):
+        s = s + A[r,m]*np.cos(m*Wo[r]*range(-N,N) + phase[r,m])
+    return s

+nb_plotsy = np.floor(np.sqrt(nb_plots)); nb_plotsx=nb_plots/nb_plotsy;
+frames = np.array(args.frames,dtype=int)

+plt.figure(1)
+plt.plot(history.history['loss'])
+plt.title('model loss')
+plt.xlabel('epoch')
+plt.show(block=False)

+plt.figure(2)
+plt.title('Amplitudes Spectra')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r];
+    plt.plot(np.log10(A[f,1:L[f]]),'g')
+    t = "frame %d" % (f)
+    plt.title(t)
+plt.show(block=False)

+plt.figure(3)
+plt.title('Phase Spectra')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r]
+    plt.plot(phase[f,1:L[f]]*180/np.pi,'g')
+    plt.plot(phase_est[f,1:L[f]]*180/np.pi,'r')
+    plt.ylim(-180,180)
+    plt.legend(("phase","phase_est"))
+plt.show(block=False)

+plt.figure(4)
+plt.title('Time Domain')
+for r in range(nb_plots):
+    plt.subplot(nb_plotsy,nb_plotsx,r+1)
+    f = frames[r];
+    s = sample_time(f, phase)
+    s_est = sample_time(f, phase_est)
+    plt.plot(range(-N,N),s,'g')
+    plt.plot(range(-N,N),s_est,'r')
+    plt.legend(("s","s_est"))
+plt.show(block=False)

+print("Click on last figure to finish....")
+plt.waitforbuttonpress(0)
+plt.close()
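A minimal, self-contained numpy sketch (not part of this commit; Wo, L and phase below are toy values) of the sparse cos()/sin() phase encoding that phasenn_train.py builds above, and of how an angle is recovered from a (cos, sin) pair with np.angle(), the same way the script decodes the network output:

import numpy as np

width = 256                               # sparse vector bins covering 0 ... Fs/2
Wo = np.pi*200.0/4000.0                   # toy fundamental: 200 Hz at Fs = 8000 Hz
L = 10                                    # toy number of harmonics
phase = np.random.uniform(-np.pi, np.pi, L+1)

phase_rect = np.zeros(2*width)
for m in range(1, L+1):
    # the frequency of harmonic m sets its bin position in the sparse vector
    b = min(width-1, int(np.round(m*Wo*width/np.pi)))
    phase_rect[2*b]   = np.cos(phase[m])
    phase_rect[2*b+1] = np.sin(phase[m])

# decode: np.angle() of cos + j*sin returns the original phase, wrapped to (-pi, pi]
for m in range(1, L+1):
    b = min(width-1, int(np.round(m*Wo*width/np.pi)))
    est = np.angle(phase_rect[2*b] + 1j*phase_rect[2*b+1])
    assert abs(np.angle(np.exp(1j*(est - phase[m])))) < 1e-6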
@@ -12,4 +12,4 @@ x=$(basename $speech)
 base="${x%.*}"

 c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model'
-./plot_n0.py $base'_nolinear.model' --start 25
+./plot_n0.py $base'_nolinear.model'
train.sh (6)
@@ -4,9 +4,9 @@

 PATH=$PATH:~/codec2/build_linux/src:~/codec2/build_linux/misc

-speech=~/Downloads/all_8k.sw
+speech=~/Downloads/train_8k.sw
 x=$(basename $speech)
 base="${x%.*}"

-c2sim $speech --modelout - | est_n0 -r > $base'_nolinear.model'
-./phasenn_train.py $base'_nolinear.model' --frames 560,655,990,2899
+sox -t .sw -r 8000 -c 1 $speech -t .sw - trim 0 600 | c2sim - --modelout - | est_n0 -r > $base'_nolinear.model'
+./phasenn_train.py $base'_nolinear.model' --frames 1572,1908,6792,9600
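Two of the ideas in phasenn_train.py above, shown as a small stand-alone numpy sketch (illustrative only, made-up vectors, not code from the repo): the custom sparse_loss averages squared error only over the non-zero elements of the sparse vectors, and the angle error is measured through complex rotation so that it wraps correctly rather than reporting a near-2*pi difference as a large error:

import numpy as np

# masked squared error, as in sparse_loss(): zero bins do not contribute
y_true = np.array([0.0, 1.0, 0.0, -0.5])
y_pred = np.array([0.1, 0.8, 0.0, -0.4])
mask = (y_pred != 0).astype(float)
loss = np.sum(((y_pred - y_true)*mask)**2) / mask.sum()
print("masked loss: %f" % loss)           # 0.02, averaged over the 3 non-zero bins

# wrap-aware angle error, as in err_angle = np.angle(c1 * np.conj(c2))
phase     = np.array([ 3.0, -3.0])
phase_est = np.array([-3.0,  3.0])
err_angle = np.angle(np.exp(1j*phase) * np.conj(np.exp(1j*phase_est)))
print(err_angle)                          # about [-0.28, 0.28] rad, not +/- 6 rad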