resolved merge on deep

2019-07-27 11:48:14 +09:30 · 2019-07-27 11:48:14 +09:30 · b8e18d096c
parent 6bd67c76fb 70aded5485
commit b8e18d096c
12 changed files with 103 additions and 78 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -19,7 +19,7 @@ install:
 script:
    # First build and install vanilla codec2 as we need -lcodec2 to build LPCNet
    - git clone https://github.com/drowe67/codec2.git
-    - cd codec2 && git checkout brad-2020
+    - cd codec2
    - mkdir build_linux && cd build_linux && cmake .. && make VERBOSE=1 codec2
    # OK, build and test LPCNet
    - cd $LPCNETDIR && mkdir -p $BUILDDIR && cd $BUILDDIR 
--- a/README.md
+++ b/README.md
@ -6,10 +6,11 @@ Experimental version of LPCNet being developed for over the air Digital Voice ex

 ```
 $ git clone https://github.com/drowe67/codec2.git
-$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && sudo make install
+$ cd codec2 && mkdir build_linux && cd build_linux && cmake ../ && make
 $ cd ~
 $ git clone https://github.com/drowe67/LPCNet.git
-$ cd LPCNet && mkdir build_linux && cd build_linux && cmake ..
+$ cd LPCNet && mkdir build_linux && cd build_linux
+$ cmake -DCODEC2_BUILD_DIR=~/codec2/build_linux ..
 $ make
 ```

@ -22,7 +23,7 @@ $ sox ../../wav/wia.wav -t raw -r 16000 - | ./dump_data --c2pitch --test - - | .

 LPCNet at 1733 bits/s using direct-split quantiser:
 ```
-$ sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
+sox ../../wav/wia.wav -t raw -r 16000 - | ./lpcnet_enc -s | ./lpcnet_dec -s | aplay -f S16_LE -r 16000
 ```

 # Reading Further
@ -37,7 +38,7 @@ Thanks [Jean-Marc Valin](https://people.xiph.org/~jm/demo/lpcnet/) for making LP

 # Cross Compiling for Windows

-This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_winows.sh script.
+This code has been cross compiled to Windows using Fedora Linux 30, see the freedv-gui README.md, and build_windows.sh script.

 # Speech Material for Training

@ -48,14 +49,6 @@ cd 16k-LP7
 sh /path/to/LPCNet/src/concat.sh
 ```

-## Playing files on a remote machine
-
-I use a server for training, but my laptop for listening:
-
-```
-david@laptop:~$ scp server:LPCNet/build_linux/speech_orig_16kb.raw /dev/stdout | aplay -f S16_LE -r 16000
-```
-
 # Quantiser Experiments

 The quantiser files used for these experiments (pred_v2.tgz and split.tgz) are [here](http://rowetel.com/downloads/deep/lpcnet_quant)
--- a/src/codec2_pitch.c
+++ b/src/codec2_pitch.c
@ -92,10 +92,10 @@ CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call)

 /* returns an estimate of the pitch period, input is a buffer of samples on length pitch->m */

-int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
+int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr)
 {
    COMP  Sw[FFT_ENC];	        /* DFT of Sn[] */
-    float pitch_samples, snr;
+    float pitch_samples;
    MODEL model;
    
    *f0 = nlp(pitch->nlp_states, Sn, pitch->c2const.n_samp, &pitch_samples, Sw, pitch->W, &pitch->prev_f0);
@ -103,9 +103,9 @@ int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing)
    dft_speech(&pitch->c2const, pitch->fft_fwd_cfg, Sw, Sn, pitch->w);
    two_stage_pitch_refinement(&pitch->c2const, &model, Sw);
    estimate_amplitudes(&model, Sw, pitch->W, 1);
-    snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);
+    *snr = est_voicing_mbe(&pitch->c2const, &model, Sw, pitch->W);

-    *voicing = 1.0 - 2.0/pow(10.0, snr/10.0);
+    *voicing = 1.0 - 2.0/pow(10.0, *snr/10.0);
    if (*voicing < 0.0) *voicing = 0.0;
    return (int)2*pitch_samples;
 }
--- a/src/codec2_pitch.h
+++ b/src/codec2_pitch.h
@ -13,7 +13,7 @@

 typedef struct CODEC2_PITCH_S CODEC2_PITCH;
 CODEC2_PITCH *codec2_pitch_create(int *Sn_size, int *new_samples_each_call);
-int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing);
+int codec2_pitch_est(CODEC2_PITCH *pitch, float Sn[], float *f0, float *voicing, float *snr);
 void codec2_pitch_destroy(CODEC2_PITCH *pitch);

 #endif
--- a/src/dump_data.c
+++ b/src/dump_data.c
@ -293,6 +293,7 @@ int main(int argc, char **argv) {
  float noise_std=0;
  int training = -1;
  int c2pitch_en = 0;
+  int c2voicing_en = 0;
  int nvec = 5000000;
  float delta_f0 = 0.0;
  int fuzz = 1;
@ -321,6 +322,7 @@ int main(int argc, char **argv) {
          {"train",     no_argument,       0, 'r'},
          {"short",     required_argument, 0, 's'},
          {"test",      no_argument,       0, 't'},
+          {"c2voicing", no_argument,       0, 'v'},
          {"fuzz",      required_argument, 0, 'z'},
          {0, 0, 0, 0}
      };
@ -331,6 +333,9 @@ int main(int argc, char **argv) {
      case 'c':
          c2pitch_en = 1;
          break;
+      case 'v':
+          c2voicing_en = 1;
+          break;
      case 'f':
          dump_fft = 1;
          f_fft = fopen(optarg, "wb");
@ -408,6 +413,7 @@ int main(int argc, char **argv) {
      fprintf(stderr, "  -z --fuzz             fuzz freq response and gain during training (default on)\n");
      fprintf(stderr, "  -f --dumpfft FileName dump a file of fft log energy samples\n");
      fprintf(stderr, "  -s --short   FileName dump (ulaw) pcm file in 16 bit short format as well\n");
+      fprintf(stderr, "  -v --c2voicing        Codec 2 voicing estimator\n");
      exit(1);
  }
    
@ -514,15 +520,15 @@ int main(int argc, char **argv) {
            c2_Sn[i] = c2_Sn[i+c2_frame_size];
        for(i=0; i<c2_frame_size; i++)
            c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
-        float f0, voicing; int pitch_index;
-        pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing);
- 	if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
-	if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
-	assert(pitch_index < 2*PITCH_MAX_PERIOD);
-	assert(pitch_index >= 2*PITCH_MIN_PERIOD);
+        float f0, voicing, snr; int pitch_index;
+        pitch_index = codec2_pitch_est(c2pitch, c2_Sn, &f0, &voicing, &snr);
+      	if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
+	      if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
+	      assert(pitch_index < 2*PITCH_MAX_PERIOD);
+	      assert(pitch_index >= 2*PITCH_MIN_PERIOD);
+
        features[2*NB_BANDS] = 0.01*(pitch_index-200);
-        // Tried using Codec 2 voicing est but poor results
-        // features[2*NB_BANDS+1] = voicing;
+        if (c2voicing_en) features[2*NB_BANDS+1] = voicing;
        //int pitch_index_lpcnet = 100*features[2*NB_BANDS] + 200;        
        //fprintf(stderr, "%f %d %d v: %f %f\n", f0, pitch_index, pitch_index, features[2*NB_BANDS+1], voicing);
    }
--- a/src/freq.c
+++ b/src/freq.c
@ -65,8 +65,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
    int j;
    int band_size;
    band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
-    //fprintf(stderr, "i: %d band_size: %d eband5ms[i]*WINDOW_SIZE_5MS: %d \n", i, band_size, eband5ms[i]*WINDOW_SIZE_5MS);
-    // frac implements a triangular window of FFT energy
    for (j=0;j<band_size;j++) {
      float tmp;
      float frac = (float)j/band_size;
@ -74,7 +72,6 @@ void compute_band_energy(float *bandE, const kiss_fft_cpx *X) {
      tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
      sum[i] += (1-frac)*tmp;
      sum[i+1] += frac*tmp;
-      //fprintf(stderr, "i: %d j: %d frac: %4.3f\n", i, j, frac);
    }
  }
  sum[0] *= 2;
--- a/src/lpcnet.c
+++ b/src/lpcnet.c
@ -154,10 +154,11 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
    run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
    memcpy(lpc, lpcnet->old_lpc[FEATURES_DELAY-1], LPC_ORDER*sizeof(lpc[0]));
    memmove(lpcnet->old_lpc[1], lpcnet->old_lpc[0], (FEATURES_DELAY-1)*LPC_ORDER*sizeof(lpc[0]));
+
    if (logmag) {
-	float tmp[NB_BANDS];
-	for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
-	lpc_from_bands(lpcnet->old_lpc[0], tmp);
+        float tmp[NB_BANDS];
+        for (i=0;i<NB_BANDS;i++) tmp[i] = pow(10.f, features[i]);
+        lpc_from_bands(lpcnet->old_lpc[0], tmp);
    }
    else
 	lpc_from_cepstrum(lpcnet->old_lpc[0], features);
@ -204,6 +205,7 @@ void lpcnet_synthesize(LPCNetState *lpcnet, short *output, const float *features
        pred_ulaw = lin2ulaw(pred);
        run_sample_network(&lpcnet->nnet, pdf, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw);
        exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, MAX16(0, 1.5f*pitch_gain - .5f), PDF_FLOOR);
+        //exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, 0.5*pitch_gain-0.5, PDF_FLOOR);
        pcm = pred + ulaw2lin(exc);
        RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
        lpcnet->last_sig[0] = pcm;
--- a/src/lpcnet_dump.c
+++ b/src/lpcnet_dump.c
@ -207,6 +207,7 @@ LPCNET_DUMP *lpcnet_dump_create(void) {
    assert(d->c2_Sn != NULL);
    for(i=0; i<d->c2_Sn_size; i++) d->c2_Sn[i] = 0.0;

+    d->c2voicing = 0;
    assert(LPCNET_NB_FEATURES == NB_FEATURES);
    return d;
 }
@ -231,7 +232,7 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
    for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
    compute_frame_features(d->st, X, Ex, features, x);

-    /* inject pitch from Codec 2 pitch estimator */
+    /* inject pitch and (optionally) voicing from Codec 2 pitch estimator */
    
    int c2_Sn_size = d->c2_Sn_size;
    int c2_frame_size = d->c2_frame_size;
@ -240,10 +241,15 @@ void lpcnet_dump(LPCNET_DUMP *d, float x[], float features[])
        c2_Sn[i] = c2_Sn[i+c2_frame_size];
    for(i=0; i<c2_frame_size; i++)
        c2_Sn[i+c2_Sn_size-c2_frame_size] = x[i];
-    float f0, voicing; int pitch_index;
-    pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing);
+
+    float f0, voicing, snr; int pitch_index;
+    pitch_index = codec2_pitch_est(d->c2pitch, c2_Sn, &f0, &voicing, &snr);
    if (pitch_index >= 2*PITCH_MAX_PERIOD) pitch_index = 2*PITCH_MAX_PERIOD-1;
    if (pitch_index < 2*PITCH_MIN_PERIOD) pitch_index = 2*PITCH_MIN_PERIOD;
+
    features[2*NB_BANDS] = 0.01*(pitch_index-200);
+    if (d->c2voicing)
+        features[2*NB_BANDS+1] = voicing;
+
 }
    
--- a/src/lpcnet_dump.h
+++ b/src/lpcnet_dump.h
@ -50,6 +50,7 @@ typedef struct {
    CODEC2_PITCH *c2pitch;
    int c2_Sn_size, c2_frame_size;
    float *c2_Sn;
+    int c2voicing;
 } LPCNET_DUMP;

 LPCNET_DUMP *lpcnet_dump_create(void);
--- a/src/plot_features.m
+++ b/src/plot_features.m
@ -3,11 +3,63 @@
 %
 % Octave script to plot features against time

-nb_lpcnet_features=55;
-nb_lpcnet_bands=18;
-Fs = 16000;
-Fsf = 100;
-Fssv = 50;
+function plot_features(fn_speech, fn_feat, fn_feat2)  % plot speech + LPCNet features vs time; fn_feat2 is optional
+  nb_lpcnet_features=55;  % row width handed to load_f32 (presumably floats per feature vector)
+  nb_lpcnet_bands=18;     % number of band columns at the start of each feature row
+  Fs = 16000;             % rate used to convert seconds to speech sample indexes
+  Fsf = 100;              % rate used to convert seconds to feature row indexes
+  Fssv = 50;              % not referenced anywhere in this function
+
+  st_sec=1; en_sec=2;     % start/end (seconds) of the window shown in the time plots
+
+  feat_lpcnet=load_f32(fn_feat, nb_lpcnet_features);
+  dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
+  dctLydB = 10*dctLy;
+  pitch_index_lpcnet = 100*feat_lpcnet(:,2*nb_lpcnet_bands+1) + 200;  % inverts the dump-side mapping 0.01*(pitch_index-200)
+  f0 = 2*Fs ./ pitch_index_lpcnet;  % factor 2: the C side returns the pitch index as 2*pitch_samples
+  gain = feat_lpcnet(:,2*nb_lpcnet_bands+2);
+  %snr = feat_lpcnet(:,2*nb_lpcnet_bands+3);
+  voicing = feat_lpcnet(:,2*nb_lpcnet_bands+3);  % NOTE(review): C dump code writes voicing at 0-based index 2*NB_BANDS+1, i.e. the column read as "gain" above - confirm which column is intended
+
+  fs=fopen(fn_speech,"rb");
+  s = fread(fs,Inf,"short");  % whole file read as "short" samples
+  fclose(fs);
+
+  figure(1); clf;  % figure 1: speech, f0, then gain and voicing overlaid
+  subplot(311);
+  plot_against_time(s, st_sec, en_sec, Fs, 'b;speech;')
+  subplot(312);
+  plot_against_time(f0, st_sec, en_sec, Fsf, 'b;f0;');
+  %plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
+  subplot(313);
+  plot_against_time(gain, st_sec, en_sec, Fsf, 'b;gain;');
+  hold on;
+  plot_against_time(voicing, st_sec, en_sec, Fsf, 'g;voicing;');
+  %plot_against_time(snr, st_sec, en_sec, Fsf, 'g;snr;');
+  %axis([st_sec en_sec 0 10])
+  hold off;
+  
+  LydB = idct(dctLydB')';  % inverse DCT applied along each row (transpose, idct per column, transpose back)
+  figure(2); clf;  % figure 2: mesh of LydB over the selected window
+  mesh_against_time(LydB, st_sec, en_sec, Fsf); title('LydB');
+
+  figure(3); clf;  % figure 3: per-band mean of LydB over all rows
+  bar(mean(LydB)); title ('mean LydB'); axis([1 18 0 25]);
+  figure(4); clf;  % figure 4: gain histogram(s), bins 0:0.1:1
+  subplot(211); hist(gain,0:0.1:1); title ('gain'); 
+
+  if nargin == 3  % only when a second feature file was supplied
+    feat_lpcnet2=load_f32(fn_feat2, nb_lpcnet_features);
+    dctLy2 = feat_lpcnet2(:,1:nb_lpcnet_bands);
+    dctLydB2 = 10*dctLy2;
+    LydB2 = idct(dctLydB2')';
+    figure(5); clf;  % figure 5: per-band mean for the second file
+    bar(mean(LydB2)); title ('mean LydB2'); axis([1 18 0 25]);
+    figure(4); subplot(212);  % second file's gain histogram goes under the first
+    gain2 = feat_lpcnet2(:,2*nb_lpcnet_bands+2);
+    hist(gain2,0:0.1:1);
+  end
+endfunction

 function plot_against_time(v, st_sec, en_sec, Fs, leg)
  st = Fs*st_sec; en = Fs*en_sec;
@ -21,34 +73,3 @@ function mesh_against_time(m, st_sec, en_sec, Fs)
  mesh(m(st+1:en+1,:));  
 endfunction

-feat_lpcnet=load_f32("../birch.f32", nb_lpcnet_features);
-dctLy = feat_lpcnet(:,1:nb_lpcnet_bands);
-dctLydB = 10*dctLy;
-
-fs=fopen("../speech_orig_16k_centre.s16","rb");
-s = fread(fs,Inf,"short");
-fclose(fs);
-
-st_sec=0; en_sec=1;
-
-figure(1); clf;
-subplot(211);
-plot_against_time(s, st_sec, en_sec, Fs, 'b')
-subplot(212);
-plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
-
-figure(2); clf;
-mesh_against_time(dctLydB(:,1:5), st_sec, en_sec, Fsf);
-
-LydB = idct(dctLydB')';
-figure(3); clf;
-mesh_against_time(LydB, st_sec, en_sec, Fsf);
-
-dc = mean(LydB')';
-figure(4);
-subplot(211);
-plot_against_time(dctLydB(:,1), st_sec, en_sec, Fsf, 'r');
-subplot(212);
-plot_against_time(dc, st_sec, en_sec, Fsf, 'r');
-
-printf("mean dctLydB(:,1): %f mean dc: %f\n", mean(dctLydB(:,1)), mean(dc));
--- a/src/tcodec2_pitch.c
+++ b/src/tcodec2_pitch.c
@ -34,7 +34,7 @@ int main(int argc, char *argv[])
    float Sn[Sn_size];	               /* float buffer of input speech samples */
    FILE *fin,*fout;
    int   pitch_samples;
-    float f0, voicing;    
+    float f0, voicing, snr;    
    int   i;

    /* Input file */
@ -63,7 +63,7 @@ int main(int argc, char *argv[])
        Sn[i] = Sn[i+new_samples_each_call];
      for(i=0; i<new_samples_each_call; i++)
        Sn[i+Sn_size-new_samples_each_call] = buf[i];
-      pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing);
+      pitch_samples = codec2_pitch_est(c2_pitch, Sn, &f0, &voicing, &snr);

      fprintf(fout,"%f %d\n", f0, pitch_samples);
    }
--- a/src/test_lpcnet.c
+++ b/src/test_lpcnet.c
@ -40,10 +40,10 @@ int main(int argc, char **argv) {
    int o = 0;
    int opt_idx = 0;
    while( o != -1 ) {
-	static struct option long_opts[] = {
-	    {"mag", no_argument,0, 'i'},
-	    {0, 0, 0, 0}
-	};
+        static struct option long_opts[] = {
+            {"mag", no_argument,0, 'i'},
+            {0, 0, 0, 0}
+        };
        
 	o = getopt_long(argc,argv,"ih",long_opts,&opt_idx);
        
@ -85,7 +85,6 @@ int main(int argc, char **argv) {

    net = lpcnet_create();
    lpcnet_open_test_file(net, "test_lpcnet_states.f32");
-
    while (1) {
        float in_features[NB_TOTAL_FEATURES];
        float features[NB_FEATURES];