resolved merge on deep

pull/8/head
David 2019-07-27 11:48:14 +09:30
commit b8e18d096c
12 changed files with 103 additions and 78 deletions

View File

@ -19,7 +19,7 @@ install:
script:
# First build and install vanilla codec2 as we need -lcodec2 to build LPCNet
- git clone https://github.com/drowe67/codec2.git
- cd codec2 && git checkout brad-2020
- cd codec2
- mkdir build_linux && cd build_linux && cmake .. && make VERBOSE=1 codec2
# OK, build and test LPCNet
- cd $LPCNETDIR && mkdir -p $BUILDDIR && cd $BUILDDIR

View File

@ -6,10 +6,11 @@ Experimental version of LPCNet being developed for over the air Digital Voice ex
```
$ git clone https://github.com/drowe67/codec2.git
$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && sudo make install
$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && make
$ cd ~
$ git clone https://github.com/drowe67/LPCNet.git
$ cd LPCNet && mkdir build_linux && cd build_linux && cmake ..
$ cd LPCNet && mkdir build_linux && cd build_linux
$ cmake -DCODEC2_BUILD_DIR=~/codec2/build_linux ..
$ make
```
@ -22,7 +23,7 @@ $ sox ../../wav/wia.wav -t raw -r 16000 - | ./dump_data --c2pitch --test - - | .
LPCNet at 1733 bits/s using direct-split quantiser:
```
$ sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
```
# Reading Further
@ -37,7 +38,7 @@ Thanks [Jean-Marc Valin](https://people.xiph.org/~jm/demo/lpcnet/) for making LP
# Cross Compiling for Windows
This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_winows.sh script.
This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_windows.sh script.
# Speech Material for Training
@ -48,14 +49,6 @@ cd 16k-LP7
sh /path/to/LPCNet/src/concat.sh
```
## Playing files on a remote machine
I use a server for training, but my laptop for listening:
```
david@laptop:~$ scp server:LPCNet/build_linux/speech_orig_16kb.raw /dev/stdout | aplay -f S16_LE -r 16000
```
# Quantiser Experiments
The quantiser files used for these experiments (pred_v2.tgz and split.tgz) are [here](http://rowetel.com/downloads/deep/lpcnet_quant)

View File

@ -92,10 +92,10 @@ CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call)
/* returns an estimate of the pitch period, input is a buffer of samples of length pitch->m */
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr)
{
COMP Sw[FFT_ENC]; /* DFT of Sn[] */
float pitch_samples, snr;
float pitch_samples;
MODEL model;
*f0 = nlp(pitch->nlp_states, Sn, pitch->c2const.n_samp, &pitch_samples, Sw, pitch->W, &pitch->prev_f0);
@ -103,9 +103,9 @@ int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
dft_speech(&pitch->c2const, pitch->fft_fwd_cfg, Sw, Sn, pitch->w);
two_stage_pitch_refinement(&pitch->c2const, &model, Sw);
estimate_amplitudes(&model, Sw, pitch->W, 1);
snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);
*snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);
*voicing = 1.0 - 2.0/pow(10.0, snr/10.0);
*voicing = 1.0 - 2.0/pow(10.0, *snr/10.0);
if (*voicing < 0.0) *voicing = 0.0;
return (int)2*pitch_samples;
}

View File

@ -13,7 +13,7 @@
typedef struct CODEC2_PITCH_S CODEC2_PITCH;
CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call);
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing);
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr);
void codec2_pitch_destroy(CODEC2_PITCH *pitch);
#endif

View File

@ -293,6 +293,7 @@ int main(int argc, char **argv) {
float noise_std=0;
int training = -1;
int c2pitch_en = 0;
int c2voicing_en = 0;
int nvec = 5000000;
float delta_f0 = 0.0;
int fuzz = 1;
@ -321,6 +322,7 @@ int main(int argc, char **argv) {
{"train", no_argument, 0, 'r'},
{"short", required_argument, 0, 's'},
{"test", no_argument, 0, 't'},
{"c2voicing", no_argument, 0, 'v'},
{"fuzz", required_argument, 0, 'z'},
{0, 0, 0, 0}
};
@ -331,6 +333,9 @@ int main(int argc, char **argv) {
case 'c':
c2pitch_en = 1;
break;
case 'v':
c2voicing_en = 1;
break;
case 'f':
dump_fft = 1;
f_fft = fopen(optarg, "wb");
@ -408,6 +413,7 @@ int main(int argc, char **argv) {
fprintf(stderr, " -z --fuzz fuzz freq response and gain during training (default on)\n");
fprintf(stderr, " -f --dumpfft FileName dump a file of fft log energy samples\n");
fprintf(stderr, " -s --short FileName dump (ulaw) pcm file in 16 bit short format as well\n");
fprintf(stderr, " -c --c2voicing Codec 2 voicing estimator\n");
exit(1);
}
@ -514,15 +520,15 @@ int main(int argc, char **argv) {
c2_Sn[i] = c2_Sn[i+c2_frame_size];
for(i=0; i<c2_frame_size; i++)
c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
float f0, voicing; int pitch_index;
pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing);
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
assert(pitch_index < 2*PITCH_MAX_PERIOD);
assert(pitch_index >= 2*PITCH_MIN_PERIOD);
float f0, voicing, snr; int pitch_index;
pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing, &snr);
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
assert(pitch_index < 2*PITCH_MAX_PERIOD);
assert(pitch_index >= 2*PITCH_MIN_PERIOD);
features[2*NB_BANDS] = 0.01*(pitch_index-200);
// Tried using Codec 2 voicing est but poor results
// features[2*NB_BANDS+1] = voicing;
if (c2voicing_en) features[2*NB_BANDS+1] = voicing;
//int pitch_index_lpcnet = 100*features[2*NB_BANDS] + 200;
//fprintf(stderr, "%f %d %d v: %f %f\n", f0, pitch_index, pitch_index, features[2*NB_BANDS+1], voicing);
}

View File

@ -65,8 +65,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
int j;
int band_size;
band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
//fprintf(stderr, "i: %d band_size: %d eband5ms[i]*WINDOW_SIZE_5MS: %d \n", i, band_size, eband5ms[i]*WINDOW_SIZE_5MS);
// frac implements a triangular window of FFT energy
for (j=0;j<band_size;j++) {
float tmp;
float frac = (float)j/band_size;
@ -74,7 +72,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
sum[i] += (1-frac)*tmp;
sum[i+1] += frac*tmp;
//fprintf(stderr, "i: %d j: %d frac: %4.3f\n", i, j, frac);
}
}
sum[0] *= 2;

View File

@ -154,10 +154,11 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
if (logmag) {
float tmp[NB_BANDS];
for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
lpc_from_bands(lpcnet->old_lpc[0], tmp);
float tmp[NB_BANDS];
for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
lpc_from_bands(lpcnet->old_lpc[0], tmp);
}
else
lpc_from_cepstrum(lpcnet->old_lpc[0], features);
@ -204,6 +205,7 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
pred_ulaw = lin2ulaw(pred);
run_sample_network(&lpcnet->nnet, pdf, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw);
exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, MAX16(0, 1.5f*pitch_gain - .5f), PDF_FLOOR);
//exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, 0.5*pitch_gain-0.5, PDF_FLOOR);
pcm = pred + ulaw2lin(exc);
RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
lpcnet->last_sig[0] = pcm;

View File

@ -207,6 +207,7 @@ LPCNET_DUMP *lpcnet_dump_create(void) {
assert(d->c2_Sn != NULL);
for(i=0; i<d->c2_Sn_size; i++) d->c2_Sn[i] = 0.0;
d->c2voicing = 0;
assert(LPCNET_NB_FEATURES == NB_FEATURES);
return d;
}
@ -231,7 +232,7 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
compute_frame_features(d->st, X, Ex, features, x);
/* inject pitch from Codec 2 pitch estimator */
/* inject pitch and (optionally) voicing from Codec 2 pitch estimator */
int c2_Sn_size = d->c2_Sn_size;
int c2_frame_size = d->c2_frame_size;
@ -240,10 +241,15 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
c2_Sn[i] = c2_Sn[i+c2_frame_size];
for(i=0; i<c2_frame_size; i++)
c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
float f0, voicing; int pitch_index;
pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing);
float f0, voicing, snr; int pitch_index;
pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing, &snr);
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
features[2*NB_BANDS] = 0.01*(pitch_index-200);
if (d->c2voicing)
features[2*NB_BANDS+1] = voicing;
}

View File

@ -50,6 +50,7 @@ typedef struct {
CODEC2_PITCH *c2pitch;
int c2_Sn_size, c2_frame_size;
float *c2_Sn;
int c2voicing;
} LPCNET_DUMP;
LPCNET_DUMP *lpcnet_dump_create(void);

View File

@ -3,11 +3,63 @@
%
% Octave script to plot features against time
nb_lpcnet_features=55;
nb_lpcnet_bands=18;
Fs = 16000;
Fsf = 100;
Fssv = 50;
function plot_features(fn_speech, fn_feat, fn_feat2)
% Plot a fixed time window of a speech file together with the features
% stored in fn_feat, and optionally compare against a second feature file.
%
%   fn_speech  file name of the raw speech samples (read with fread "short")
%   fn_feat    file name of the feature file, loaded via load_f32()
%   fn_feat2   optional second feature file; only used when three arguments
%              are supplied (nargin == 3)
%
% Figures produced: 1 = speech / f0 / gain+voicing against time,
% 2 = mesh of LydB, 3 = mean LydB per band, 4 = gain histogram(s),
% 5 = mean LydB of the second file (three-argument form only).

% Number of values per feature vector, passed straight to load_f32().
nb_lpcnet_features=55;
% Number of leading columns treated as band features.
nb_lpcnet_bands=18;
% Rate passed to plot_against_time() for the speech samples.
Fs = 16000;
% Rate passed to the plot helpers for per-frame feature tracks.
Fsf = 100;
% Not referenced anywhere else in this function.
Fssv = 50;
% Start and end of the plotted window, in seconds (hard coded).
st_sec=1; en_sec=2;
% load_f32() is defined elsewhere; the indexing below treats its result as
% one row per frame and one column per feature -- TODO confirm.
feat_lpcnet=load_f32(fn_feat, nb_lpcnet_features);
dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
% Scaled by 10; the name suggests dB units -- confirm against the feature
% extraction code.
dctLydB = 10*dctLy;
% Map the stored pitch feature back to an index: feature*100 + 200.
% NOTE(review): this looks like the inverse of a 0.01*(pitch_index-200)
% scaling applied when the feature is written -- confirm against the writer.
pitch_index_lpcnet = 100*feat_lpcnet(:,2*nb_lpcnet_bands+1) + 200;
f0 = 2*Fs ./ pitch_index_lpcnet;
% NOTE(review): column 2*nb_lpcnet_bands+2 (1-based) is labelled "gain" here
% and column 2*nb_lpcnet_bands+3 is labelled "voicing"; verify these labels
% match the column order produced by the feature writer.
gain = feat_lpcnet(:,2*nb_lpcnet_bands+2);
%snr = feat_lpcnet(:,2*nb_lpcnet_bands+3);
voicing = feat_lpcnet(:,2*nb_lpcnet_bands+3);
% Read the whole speech file as "short" samples.
% NOTE(review): the fopen() result is not checked before fread()/fclose().
fs=fopen(fn_speech,"rb");
s = fread(fs,Inf,"short");
fclose(fs);
% Figure 1: three stacked panels sharing the same time window.
figure(1); clf;
subplot(311);
plot_against_time(s, st_sec, en_sec, Fs, 'b;speech;')
subplot(312);
plot_against_time(f0, st_sec, en_sec, Fsf, 'b;f0;');
%plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
subplot(313);
% gain and voicing are overlaid on the same axes (hold on ... hold off).
plot_against_time(gain, st_sec, en_sec, Fsf, 'b;gain;');
hold on;
plot_against_time(voicing, st_sec, en_sec, Fsf, 'g;voicing;');
%plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
%axis([st_sec en_sec 0 10])
hold off;
% idct() works down columns, so transpose to transform each frame (row)
% and transpose back to keep one row per frame.
LydB = idct(dctLydB')';
figure(2); clf;
mesh_against_time(LydB, st_sec, en_sec, Fsf); title('LydB');
figure(3); clf;
% Mean over all frames for each band; the x-axis limit 18 is hard coded and
% equals nb_lpcnet_bands as set above.
bar(mean(LydB)); title ('mean LydB'); axis([1 18 0 25]);
figure(4); clf;
% Histogram of gain using bin centres 0, 0.1, ..., 1.
subplot(211); hist(gain,0:0.1:1); title ('gain');
% Optional comparison: repeat the band-mean and gain-histogram plots for a
% second feature file.
if nargin == 3
feat_lpcnet2=load_f32(fn_feat2, nb_lpcnet_features);
dctLy2 = feat_lpcnet2(:,1:nb_lpcnet_bands);
dctLydB2 = 10*dctLy2;
LydB2 = idct(dctLydB2')';
figure(5); clf;
bar(mean(LydB2)); title ('mean LydB2'); axis([1 18 0 25]);
% Second histogram goes in the lower panel of figure 4, under the first.
figure(4); subplot(212);
gain2 = feat_lpcnet2(:,2*nb_lpcnet_bands+2);
hist(gain2,0:0.1:1);
end
endfunction
function plot_against_time(v, st_sec, en_sec, Fs, leg)
st = Fs*st_sec; en = Fs*en_sec;
@ -21,34 +73,3 @@ function mesh_against_time(m, st_sec, en_sec, Fs)
mesh(m(st+1:en+1,:));
endfunction
feat_lpcnet=load_f32("../birch.f32", nb_lpcnet_features);
dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
dctLydB = 10*dctLy;
fs=fopen("../speech_orig_16k_centre.s16","rb");
s = fread(fs,Inf,"short");
fclose(fs);
st_sec=0; en_sec=1;
figure(1); clf;
subplot(211);
plot_against_time(s, st_sec, en_sec, Fs, 'b')
subplot(212);
plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
figure(2); clf;
mesh_against_time(dctLydB(:,1:5), st_sec, en_sec, Fsf);
LydB = idct(dctLydB')';
figure(3); clf;
mesh_against_time(LydB, st_sec, en_sec, Fsf);
dc = mean(LydB')';
figure(4);
subplot(211);
plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
subplot(212);
plot_against_time(dc, st_sec, en_sec, Fsf, 'r');
printf("mean dctLydB(:,1): %f mean dc: %f\n", mean(dctLydB(:,1)), mean(dc));

View File

@ -34,7 +34,7 @@ int main(int argc, char *argv[])
float Sn[Sn_size]; /* float buffer of input speech samples */
FILE *fin,*fout;
int pitch_samples;
float f0, voicing;
float f0, voicing, snr;
int i;
/* Input file */
@ -63,7 +63,7 @@ int main(int argc, char *argv[])
Sn[i] = Sn[i+new_samples_each_call];
for(i=0; i<new_samples_each_call; i++)
Sn[i+Sn_size-new_samples_each_call] = buf[i];
pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing);
pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing, &snr);
fprintf(fout,"%f %d\n", f0, pitch_samples);
}

View File

@ -40,10 +40,10 @@ int main(int argc, char **argv) {
int o = 0;
int opt_idx = 0;
while( o != -1 ) {
static struct option long_opts[] = {
{"mag", no_argument,0, 'i'},
{0, 0, 0, 0}
};
static struct option long_opts[] = {
{"mag", no_argument,0, 'i'},
{0, 0, 0, 0}
};
o = getopt_long(argc,argv,"ih",long_opts,&opt_idx);
@ -85,7 +85,6 @@ int main(int argc, char **argv) {
net = lpcnet_create();
lpcnet_open_test_file(net, "test_lpcnet_states.f32");
while (1) {
float in_features[NB_TOTAL_FEATURES];
float features[NB_FEATURES];