mirror of https://github.com/drowe67/phasenn.git
115 lines
3.5 KiB
Python
Executable File
115 lines
3.5 KiB
Python
Executable File
#!/usr/bin/python3
|
|
# phasenn_train.py
|
|
#
|
|
# David Rowe August 2019
|
|
#
|
|
# Keras model for estimating the phase of sinusoidally modelled speech
|
|
|
|
# To generate features:
|
|
# $ ./c2sim ~/Downloads/all_speech_8k.sw --dumpphase_nnl train.f32
|
|
|
|
import numpy as np
|
|
import sys
|
|
from keras.layers import Dense
|
|
from keras import models,layers
|
|
from keras import initializers
|
|
|
|
# A features file argument is mandatory.  Report usage on stderr and
# exit with a non-zero status (the original exited 0, which falsely
# signalled success to shell pipelines).
if len(sys.argv) < 2:
    print("usage: phasenn_train.py train.f32", file=sys.stderr)
    sys.exit(1)
|
|
|
|
# constants

max_amp = 80          # harmonic bins 0..80: sparse vector covering 0 ... Fs/2
nb_features = 243     # floats per row of the sparse training-data file
nb_epochs = 10        # passes over the training data
|
|
|
|
# load training data
|
|
|
|
# Load the raw float32 feature stream named on the command line and
# work out how many complete frames (rows) it holds.
features = np.fromfile(sys.argv[1], dtype='float32')
nb_frames = len(features) // nb_features
print("nb_frames: %d" % (nb_frames))
|
|
|
|
# 0..80 log10(A)
|
|
# 81..161 cos(phi)
|
|
# 162..242 sin(phi)
|
|
|
|
# View the flat stream as one row per frame.
features = features.reshape((nb_frames, nb_features))
print("features shape:")
print(features.shape)
|
|
|
|
# So the idea is we can predict the next frame's phases from the
# current frame and the magnitude spectrum.  For voiced speech the
# sinusoids are continuous, so they can be predicted from frame to
# frame if you know the frequency and the previous phase.  We encode
# the frequency as the position in the sparse vector.
|
|
|
|
# Cascaded with that is the phase spectrum due to dispersion in the
# phase response of the vocal tract filter, e.g. a large dispersion
# around resonances.  We supply the magnitude spectrum to help model
# the vocal tract filter phase.
|
|
|
|
# Unvoiced speech has more random phase.  Hopefully the NN can work
# out whether the speech is voiced or unvoiced from the magnitude
# spectrum.
|
|
|
|
# The phase is encoded using cos and sin of the phase, as these are
|
|
# bounded by +/-1
|
|
|
|
# So the input features are this frame's log(A) and the last frame's
# phases.  The output features we are trying to model are this
# frame's phases.
|
|
|
|
# Input rows: this frame's log(A) (cols 0..max_amp) side by side with
# the previous frame's cos/sin phases (cols max_amp+1..).  Target rows:
# this frame's phases.  axis=1 is required — the two slices have
# different column counts (81 vs 162), so the default axis=0 raises.
train = np.concatenate((features[1:, :max_amp+1], features[:-1, max_amp+1:]), axis=1)
# Original had features([1:, ...]) — a call, i.e. a syntax error; this
# is the intended slice.
target = features[1:, max_amp+1:]
|
|
|
|
# Network: 243 inputs (this frame's log(A) + last frame's phases),
# three relu hidden layers, linear outputs for cos/sin of each phase.
# nb_input / nb_output were previously undefined names (NameError);
# derive them from the feature layout.
nb_input = nb_features                    # 81 log amplitudes + 162 phase terms
nb_output = nb_features - (max_amp + 1)   # 162 = cos(phi) and sin(phi) terms
model = models.Sequential()
model.add(layers.Dense(256, activation='relu', input_dim=nb_input))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(nb_output, activation='linear'))
|
|
|
|
# Custom loss function that measures the difference in phase only at
# the non-zero elements of the target (yTrue).  This could be extended
# to weight each phase error by the (log) amplitude of each harmonic.
|
|
|
|
import keras.backend as K

def customLoss(yTrue, yPred):
    # Sum-of-squares phase error, counted only where the target is
    # non-zero: predictions at harmonics we don't care about are
    # zeroed out by the mask and contribute nothing to the loss.
    nonzero = K.abs(K.sign(yTrue))       # 1.0 where yTrue != 0, else 0.0
    residual = yTrue - nonzero * yPred   # masked cos()/sin() error terms
    return K.sum(residual * residual)
|
|
|
|
# Compile our model
|
|
|
|
# Compile with stochastic gradient descent.  'sge' was a typo for
# 'sgd' — Keras raises ValueError on unknown optimizer identifiers.
from keras import optimizers
model.compile(loss=customLoss, optimizer='sgd')
|
|
|
|
# fit model, using 20% of our data for validation
|
|
|
|
# Fit the model, holding back 20% of the frames for validation, then
# save the trained weights/architecture.
history = model.fit(train, target,
                    validation_split=0.2,
                    batch_size=32,
                    epochs=nb_epochs)
model.save("phasenn_model.h5")
|
|
|
|
# Optional loss-curve plot.  The flag is a real boolean (was `0;` with
# a stray C-style semicolon) and matplotlib is only imported when
# plotting is enabled, so headless training boxes don't need it.
plot_en = False
if plot_en:
    import matplotlib.pyplot as plt
    plt.figure(1)
    # loss is a sum of squared phase errors, so sqrt gives an
    # RMS-like error curve; the 10x scale matches the original script.
    plt.plot(10*np.sqrt(history.history['loss']))
    plt.plot(10*np.sqrt(history.history['val_loss']))
    plt.title('model loss')
    plt.ylabel('rms error (rad)')
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper right')
    plt.show()
|
|
|
|
# run model on training data and measure variance, should be similar to training "loss"
|
|
|
|
# Sanity check: run the trained net back over the training data and
# measure the error spread — should be in the same ballpark as the
# final training "loss".
predicted = model.predict(train)
residual = predicted - target
variance = np.var(residual)
deviation = np.std(residual)
print("var: %f std: %f" % (variance, deviation))
|
|
|