mirror of https://github.com/drowe67/LPCNet.git
resolved merge on deep
commit
b8e18d096c
|
@ -19,7 +19,7 @@ install:
|
|||
script:
|
||||
# First build and install vanilla codec2 as we need -lcodec2 to build LPCNet
|
||||
- git clone https://github.com/drowe67/codec2.git
|
||||
- cd codec2 && git checkout brad-2020
|
||||
- cd codec2
|
||||
- mkdir build_linux && cd build_linux && cmake .. && make VERBOSE=1 codec2
|
||||
# OK, build and test LPCNet
|
||||
- cd $LPCNETDIR && mkdir -p $BUILDDIR && cd $BUILDDIR
|
||||
|
|
17
README.md
17
README.md
|
@ -6,10 +6,11 @@ Experimental version of LPCNet being developed for over the air Digital Voice ex
|
|||
|
||||
```
|
||||
$ git clone https://github.com/drowe67/codec2.git
|
||||
$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && sudo make install
|
||||
$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && make
|
||||
$ cd ~
|
||||
$ git clone https://github.com/drowe67/LPCNet.git
|
||||
$ cd LPCNet && mkdir build_linux && cd build_linux && cmake ..
|
||||
$ cd LPCNet && mkdir build_linux && cd build_linux
|
||||
$ cmake -DCODEC2_BUILD_DIR=~/codec2/build_linux ..
|
||||
$ make
|
||||
```
|
||||
|
||||
|
@ -22,7 +23,7 @@ $ sox ../../wav/wia.wav -t raw -r 16000 - | ./dump_data --c2pitch --test - - | .
|
|||
|
||||
LPCNet at 1733 bits/s using direct-split quantiser:
|
||||
```
|
||||
$ sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
|
||||
sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
|
||||
```
|
||||
|
||||
# Reading Further
|
||||
|
@ -37,7 +38,7 @@ Thanks [Jean-Marc Valin](https://people.xiph.org/~jm/demo/lpcnet/) for making LP
|
|||
|
||||
# Cross Compiling for Windows
|
||||
|
||||
This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_winows.sh script.
|
||||
This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_windows.sh script.
|
||||
|
||||
# Speech Material for Training
|
||||
|
||||
|
@ -48,14 +49,6 @@ cd 16k-LP7
|
|||
sh /path/to/LPCNet/src/concat.sh
|
||||
```
|
||||
|
||||
## Playing files on a remote machine
|
||||
|
||||
I use a server for training, but my laptop for listening:
|
||||
|
||||
```
|
||||
david@laptop:~$ scp server:LPCNet/build_linux/speech_orig_16kb.raw /dev/stdout | aplay -f S16_LE -r 16000
|
||||
```
|
||||
|
||||
# Quantiser Experiments
|
||||
|
||||
The quantiser files used for these experiments (pred_v2.tgz and split.tgz) are [here](http://rowetel.com/downloads/deep/lpcnet_quant)
|
||||
|
|
|
@ -92,10 +92,10 @@ CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call)
|
|||
|
||||
/* returns an estimate of the pitch period, input is a buffer of samples of length pitch->m */
|
||||
|
||||
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
|
||||
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr)
|
||||
{
|
||||
COMP Sw[FFT_ENC]; /* DFT of Sn[] */
|
||||
float pitch_samples, snr;
|
||||
float pitch_samples;
|
||||
MODEL model;
|
||||
|
||||
*f0 = nlp(pitch->nlp_states, Sn, pitch->c2const.n_samp, &pitch_samples, Sw, pitch->W, &pitch->prev_f0);
|
||||
|
@ -103,9 +103,9 @@ int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
|
|||
dft_speech(&pitch->c2const, pitch->fft_fwd_cfg, Sw, Sn, pitch->w);
|
||||
two_stage_pitch_refinement(&pitch->c2const, &model, Sw);
|
||||
estimate_amplitudes(&model, Sw, pitch->W, 1);
|
||||
snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);
|
||||
*snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);
|
||||
|
||||
*voicing = 1.0 - 2.0/pow(10.0, snr/10.0);
|
||||
*voicing = 1.0 - 2.0/pow(10.0, *snr/10.0);
|
||||
if (*voicing < 0.0) *voicing = 0.0;
|
||||
return (int)2*pitch_samples;
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
typedef struct CODEC2_PITCH_S CODEC2_PITCH;
|
||||
CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call);
|
||||
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing);
|
||||
int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr);
|
||||
void codec2_pitch_destroy(CODEC2_PITCH *pitch);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -293,6 +293,7 @@ int main(int argc, char **argv) {
|
|||
float noise_std=0;
|
||||
int training = -1;
|
||||
int c2pitch_en = 0;
|
||||
int c2voicing_en = 0;
|
||||
int nvec = 5000000;
|
||||
float delta_f0 = 0.0;
|
||||
int fuzz = 1;
|
||||
|
@ -321,6 +322,7 @@ int main(int argc, char **argv) {
|
|||
{"train", no_argument, 0, 'r'},
|
||||
{"short", required_argument, 0, 's'},
|
||||
{"test", no_argument, 0, 't'},
|
||||
{"c2voicing", no_argument, 0, 'v'},
|
||||
{"fuzz", required_argument, 0, 'z'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
@ -331,6 +333,9 @@ int main(int argc, char **argv) {
|
|||
case 'c':
|
||||
c2pitch_en = 1;
|
||||
break;
|
||||
case 'v':
|
||||
c2voicing_en = 1;
|
||||
break;
|
||||
case 'f':
|
||||
dump_fft = 1;
|
||||
f_fft = fopen(optarg, "wb");
|
||||
|
@ -408,6 +413,7 @@ int main(int argc, char **argv) {
|
|||
fprintf(stderr, " -z --fuzz fuzz freq response and gain during training (default on)\n");
|
||||
fprintf(stderr, " -f --dumpfft FileName dump a file of fft log energy samples\n");
|
||||
fprintf(stderr, " -s --short FileName dump (ulaw) pcm file in 16 bit short format as well\n");
|
||||
/* --c2voicing is bound to short option 'v' in long_opts; 'c' enables the Codec 2 pitch estimator */
fprintf(stderr, " -v --c2voicing Codec 2 voicing estimator\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
@ -514,15 +520,15 @@ int main(int argc, char **argv) {
|
|||
c2_Sn[i] = c2_Sn[i+c2_frame_size];
|
||||
for(i=0; i<c2_frame_size; i++)
|
||||
c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
|
||||
float f0, voicing; int pitch_index;
|
||||
pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing);
|
||||
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
|
||||
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
|
||||
assert(pitch_index < 2*PITCH_MAX_PERIOD);
|
||||
assert(pitch_index >= 2*PITCH_MIN_PERIOD);
|
||||
float f0, voicing, snr; int pitch_index;
|
||||
pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing, &snr);
|
||||
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
|
||||
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
|
||||
assert(pitch_index < 2*PITCH_MAX_PERIOD);
|
||||
assert(pitch_index >= 2*PITCH_MIN_PERIOD);
|
||||
|
||||
features[2*NB_BANDS] = 0.01*(pitch_index-200);
|
||||
// Tried using Codec 2 voicing est but poor results
|
||||
// features[2*NB_BANDS+1] = voicing;
|
||||
if (c2voicing_en) features[2*NB_BANDS+1] = voicing;
|
||||
//int pitch_index_lpcnet = 100*features[2*NB_BANDS] + 200;
|
||||
//fprintf(stderr, "%f %d %d v: %f %f\n", f0, pitch_index, pitch_index, features[2*NB_BANDS+1], voicing);
|
||||
}
|
||||
|
|
|
@ -65,8 +65,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
|
|||
int j;
|
||||
int band_size;
|
||||
band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
|
||||
//fprintf(stderr, "i: %d band_size: %d eband5ms[i]*WINDOW_SIZE_5MS: %d \n", i, band_size, eband5ms[i]*WINDOW_SIZE_5MS);
|
||||
// frac implements a triangular window of FFT energy
|
||||
for (j=0;j<band_size;j++) {
|
||||
float tmp;
|
||||
float frac = (float)j/band_size;
|
||||
|
@ -74,7 +72,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
|
|||
tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
|
||||
sum[i] += (1-frac)*tmp;
|
||||
sum[i+1] += frac*tmp;
|
||||
//fprintf(stderr, "i: %d j: %d frac: %4.3f\n", i, j, frac);
|
||||
}
|
||||
}
|
||||
sum[0] *= 2;
|
||||
|
|
|
@ -154,10 +154,11 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
|
|||
run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
|
||||
memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
|
||||
memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
|
||||
|
||||
if (logmag) {
|
||||
float tmp[NB_BANDS];
|
||||
for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
|
||||
lpc_from_bands(lpcnet->old_lpc[0], tmp);
|
||||
float tmp[NB_BANDS];
|
||||
for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
|
||||
lpc_from_bands(lpcnet->old_lpc[0], tmp);
|
||||
}
|
||||
else
|
||||
lpc_from_cepstrum(lpcnet->old_lpc[0], features);
|
||||
|
@ -204,6 +205,7 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
|
|||
pred_ulaw = lin2ulaw(pred);
|
||||
run_sample_network(&lpcnet->nnet, pdf, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw);
|
||||
exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, MAX16(0, 1.5f*pitch_gain - .5f), PDF_FLOOR);
|
||||
//exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, 0.5*pitch_gain-0.5, PDF_FLOOR);
|
||||
pcm = pred + ulaw2lin(exc);
|
||||
RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
|
||||
lpcnet->last_sig[0] = pcm;
|
||||
|
|
|
@ -207,6 +207,7 @@ LPCNET_DUMP *lpcnet_dump_create(void) {
|
|||
assert(d->c2_Sn != NULL);
|
||||
for(i=0; i<d->c2_Sn_size; i++) d->c2_Sn[i] = 0.0;
|
||||
|
||||
d->c2voicing = 0;
|
||||
assert(LPCNET_NB_FEATURES == NB_FEATURES);
|
||||
return d;
|
||||
}
|
||||
|
@ -231,7 +232,7 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
|
|||
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
|
||||
compute_frame_features(d->st, X, Ex, features, x);
|
||||
|
||||
/* inject pitch from Codec 2 pitch estimator */
|
||||
/* inject pitch and (optionally) voicing from Codec 2 pitch estimator */
|
||||
|
||||
int c2_Sn_size = d->c2_Sn_size;
|
||||
int c2_frame_size = d->c2_frame_size;
|
||||
|
@ -240,10 +241,15 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
|
|||
c2_Sn[i] = c2_Sn[i+c2_frame_size];
|
||||
for(i=0; i<c2_frame_size; i++)
|
||||
c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
|
||||
float f0, voicing; int pitch_index;
|
||||
pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing);
|
||||
|
||||
float f0, voicing, snr; int pitch_index;
|
||||
pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing, &snr);
|
||||
if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
|
||||
if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
|
||||
|
||||
features[2*NB_BANDS] = 0.01*(pitch_index-200);
|
||||
if (d->c2voicing)
|
||||
features[2*NB_BANDS+1] = voicing;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -50,6 +50,7 @@ typedef struct {
|
|||
CODEC2_PITCH *c2pitch;
|
||||
int c2_Sn_size, c2_frame_size;
|
||||
float *c2_Sn;
|
||||
int c2voicing;
|
||||
} LPCNET_DUMP;
|
||||
|
||||
LPCNET_DUMP *lpcnet_dump_create(void);
|
||||
|
|
|
@ -3,11 +3,63 @@
|
|||
%
|
||||
% Octave script to plot features against time
|
||||
|
||||
nb_lpcnet_features=55;
|
||||
nb_lpcnet_bands=18;
|
||||
Fs = 16000;
|
||||
Fsf = 100;
|
||||
Fssv = 50;
|
||||
function plot_features(fn_speech, fn_feat, fn_feat2)
|
||||
nb_lpcnet_features=55;
|
||||
nb_lpcnet_bands=18;
|
||||
Fs = 16000;
|
||||
Fsf = 100;
|
||||
Fssv = 50;
|
||||
|
||||
st_sec=1; en_sec=2;
|
||||
|
||||
feat_lpcnet=load_f32(fn_feat, nb_lpcnet_features);
|
||||
dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
|
||||
dctLydB = 10*dctLy;
|
||||
pitch_index_lpcnet = 100*feat_lpcnet(:,2*nb_lpcnet_bands+1) + 200;
|
||||
f0 = 2*Fs ./ pitch_index_lpcnet;
|
||||
gain = feat_lpcnet(:,2*nb_lpcnet_bands+2);
|
||||
%snr = feat_lpcnet(:,2*nb_lpcnet_bands+3);
|
||||
voicing = feat_lpcnet(:,2*nb_lpcnet_bands+3);
|
||||
|
||||
fs=fopen(fn_speech,"rb");
|
||||
s = fread(fs,Inf,"short");
|
||||
fclose(fs);
|
||||
|
||||
figure(1); clf;
|
||||
subplot(311);
|
||||
plot_against_time(s, st_sec, en_sec, Fs, 'b;speech;')
|
||||
subplot(312);
|
||||
plot_against_time(f0, st_sec, en_sec, Fsf, 'b;f0;');
|
||||
%plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
|
||||
subplot(313);
|
||||
plot_against_time(gain, st_sec, en_sec, Fsf, 'b;gain;');
|
||||
hold on;
|
||||
plot_against_time(voicing, st_sec, en_sec, Fsf, 'g;voicing;');
|
||||
%plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
|
||||
%axis([st_sec en_sec 0 10])
|
||||
hold off;
|
||||
|
||||
LydB = idct(dctLydB')';
|
||||
figure(2); clf;
|
||||
mesh_against_time(LydB, st_sec, en_sec, Fsf); title('LydB');
|
||||
|
||||
figure(3); clf;
|
||||
bar(mean(LydB)); title ('mean LydB'); axis([1 18 0 25]);
|
||||
figure(4); clf;
|
||||
subplot(211); hist(gain,0:0.1:1); title ('gain');
|
||||
|
||||
if nargin == 3
|
||||
feat_lpcnet2=load_f32(fn_feat2, nb_lpcnet_features);
|
||||
dctLy2 = feat_lpcnet2(:,1:nb_lpcnet_bands);
|
||||
dctLydB2 = 10*dctLy2;
|
||||
LydB2 = idct(dctLydB2')';
|
||||
figure(5); clf;
|
||||
bar(mean(LydB2)); title ('mean LydB2'); axis([1 18 0 25]);
|
||||
figure(4); subplot(212);
|
||||
gain2 = feat_lpcnet2(:,2*nb_lpcnet_bands+2);
|
||||
hist(gain2,0:0.1:1);
|
||||
end
|
||||
endfunction
|
||||
|
||||
function plot_against_time(v, st_sec, en_sec, Fs, leg)
|
||||
st = Fs*st_sec; en = Fs*en_sec;
|
||||
|
@ -21,34 +73,3 @@ function mesh_against_time(m, st_sec, en_sec, Fs)
|
|||
mesh(m(st+1:en+1,:));
|
||||
endfunction
|
||||
|
||||
feat_lpcnet=load_f32("../birch.f32", nb_lpcnet_features);
|
||||
dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
|
||||
dctLydB = 10*dctLy;
|
||||
|
||||
fs=fopen("../speech_orig_16k_centre.s16","rb");
|
||||
s = fread(fs,Inf,"short");
|
||||
fclose(fs);
|
||||
|
||||
st_sec=0; en_sec=1;
|
||||
|
||||
figure(1); clf;
|
||||
subplot(211);
|
||||
plot_against_time(s, st_sec, en_sec, Fs, 'b')
|
||||
subplot(212);
|
||||
plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
|
||||
|
||||
figure(2); clf;
|
||||
mesh_against_time(dctLydB(:,1:5), st_sec, en_sec, Fsf);
|
||||
|
||||
LydB = idct(dctLydB')';
|
||||
figure(3); clf;
|
||||
mesh_against_time(LydB, st_sec, en_sec, Fsf);
|
||||
|
||||
dc = mean(LydB')';
|
||||
figure(4);
|
||||
subplot(211);
|
||||
plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
|
||||
subplot(212);
|
||||
plot_against_time(dc, st_sec, en_sec, Fsf, 'r');
|
||||
|
||||
printf("mean dctLydB(:,1): %f mean dc: %f\n", mean(dctLydB(:,1)), mean(dc));
|
||||
|
|
|
@ -34,7 +34,7 @@ int main(int argc, char *argv[])
|
|||
float Sn[Sn_size]; /* float buffer of input speech samples */
|
||||
FILE *fin,*fout;
|
||||
int pitch_samples;
|
||||
float f0, voicing;
|
||||
float f0, voicing, snr;
|
||||
int i;
|
||||
|
||||
/* Input file */
|
||||
|
@ -63,7 +63,7 @@ int main(int argc, char *argv[])
|
|||
Sn[i] = Sn[i+new_samples_each_call];
|
||||
for(i=0; i<new_samples_each_call; i++)
|
||||
Sn[i+Sn_size-new_samples_each_call] = buf[i];
|
||||
pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing);
|
||||
pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing, &snr);
|
||||
|
||||
fprintf(fout,"%f %d\n", f0, pitch_samples);
|
||||
}
|
||||
|
|
|
@ -40,10 +40,10 @@ int main(int argc, char **argv) {
|
|||
int o = 0;
|
||||
int opt_idx = 0;
|
||||
while( o != -1 ) {
|
||||
static struct option long_opts[] = {
|
||||
{"mag", no_argument,0, 'i'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
static struct option long_opts[] = {
|
||||
{"mag", no_argument,0, 'i'},
|
||||
{0, 0, 0, 0}
|
||||
};
|
||||
|
||||
o = getopt_long(argc,argv,"ih",long_opts,&opt_idx);
|
||||
|
||||
|
@ -85,7 +85,6 @@ int main(int argc, char **argv) {
|
|||
|
||||
net = lpcnet_create();
|
||||
lpcnet_open_test_file(net, "test_lpcnet_states.f32");
|
||||
|
||||
while (1) {
|
||||
float in_features[NB_TOTAL_FEATURES];
|
||||
float features[NB_FEATURES];
|
||||
|
|
Loading…
Reference in New Issue