adding support for split VQ

2019-03-21 18:41:39 +10:30 · 2019-03-21 18:41:39 +10:30 · 446b5afd13
parent 52a2198ad7
commit 446b5afd13
4 changed files with 125 additions and 24 deletions
--- a/9
+++ b/9
@ -21,7 +21,7 @@ CFLAGS+=-mfpu=neon -march=armv8-a -mtune=cortex-a53
 endif

 PROG=dump_data test_lpcnet test_vec quant_feat tcodec2_pitch weight tdump tweak_pitch quant_test \
-     quant2c diff32 quant_enc quant_dec lpcnet_enc lpcnet_dec
+     quant2c diff32 quant_enc quant_dec lpcnet_enc lpcnet_dec idct
 all: $(PROG)

 dump_data_objs := src/dump_data.o src/freq.o src/kiss_fft.o src/pitch.o src/celt_lpc.o src/codec2_pitch.o
@ -106,9 +106,14 @@ weight_objs := src/weight.o
 weight_deps := $(weight_objs:.o=.d)
 weight: $(weight_objs)
 	gcc -o $@ $(CFLAGS) $(weight_objs) -lm
-
 -include $(weight_deps)

+idct_objs := $(addprefix src/,idct.o freq.o kiss_fft.o celt_lpc.o pitch.o)
+idct_deps := $(patsubst %.o,%.d,$(idct_objs))
+idct: $(idct_objs)
+	gcc -o $@ $(CFLAGS) $^ -lm
+-include $(idct_deps)
+
 tweak_pitch_objs := src/tweak_pitch.o
 tweak_pitch_deps := $(tweak_pitch_objs:.o=.d)
 tweak_pitch: $(tweak_pitch_objs)
--- a/src/idct.c
+++ b/src/idct.c
@ -0,0 +1,28 @@
+/*
+  idct.c
+  David Rowe Mar 2019
+
+  inverse DCT so we can experiment with training in the Ly (log magnitude) domain.
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <math.h>
+#include "freq.h"
+
+#define NB_BANDS 18
+
+/* Filter: reads frames of NB_BANDS floats from stdin, runs idct() on each, writes them to stdout. */
+int main(void) {
+    float dctLy[NB_BANDS], Ly[NB_BANDS];
+    /* a short read (EOF or a partial trailing frame) ends the loop */
+    while(fread(dctLy, sizeof(float), NB_BANDS, stdin) == NB_BANDS) {
+        idct(Ly, dctLy);
+        /* tested explicitly: assert() is compiled out under NDEBUG, hiding write errors */
+        if (fwrite(Ly, sizeof(float), NB_BANDS, stdout) != NB_BANDS) {
+            fprintf(stderr, "idct: error writing to stdout\n");
+            return 1;
+        }
+    }
+    return 0;
+}
--- a/src/quant_feat.c
+++ b/src/quant_feat.c
@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
    FILE *fpitch = NULL;
    float Fs = 16000.0;
    float uniform_step = 0.0;
+    float uniform_step2 = 0.0;
    int   mbest_survivors = 0;
    char label[80] = "";
    /* experimental limits for dctLy[0], first cepstral */
@ -62,6 +63,7 @@ int main(int argc, char *argv[]) {
    float pitch_gain_bias = 0.0;
    int   pitch_bits = 0;
    int   small_vec = 0;
+    int   logmag = 0;
    
    for(i=0; i<MAX_STAGES*NB_BANDS*MAX_ENTRIES; i++) vq[i] = 0.0;
    
@ -74,19 +76,21 @@ int main(int argc, char *argv[]) {
        {"hard",       required_argument, 0, 'h'},
        {"label",      required_argument, 0, 'l'},
        {"mbest",      required_argument, 0, 'm'},
+        {"mag",        no_argument,       0, 'i'},   /* flag only: "i" has no ':' in the optstring */
        {"pitchquant", required_argument, 0, 'o'},
        {"pred",       required_argument, 0, 'p'},
        {"quant",      required_argument, 0, 'q'},
        {"stagevar",   required_argument, 0, 's'},
        {"uniform",    required_argument, 0, 'u'},
        {"verbose",    no_argument,       0, 'v'},
+        {"uniform2",   required_argument, 0, 'x'},
        {"weight",     no_argument,       0, 'w'},
        {0, 0, 0, 0}
    };

    int opt_index = 0;
    
-    while ((c = getopt_long (argc, argv, "ad:q:vs:f:p:e:u:l:m:h:wg:o:", long_options, &opt_index)) != -1) {
+    while ((c = getopt_long (argc, argv, "ad:q:vs:f:p:e:u:l:m:h:wg:o:ix:", long_options, &opt_index)) != -1) {
        switch (c) {
        case 'a':
            /* small cpectral vectors - zero out several bands */
@ -120,6 +124,10 @@ int main(int argc, char *argv[]) {
            lower_limit = atof(optarg);            
            fprintf(stderr, "lower_limit: %f upper_limit: %f\n", lower_limit, upper_limit);
            break;
+        case 'i':
+            /* work in log mag rather than cepstral domain */
+            logmag = 1;
+            break;
        case 'l':
            /* text label to pront with results */
            strcpy(label, optarg);
@ -156,10 +164,13 @@ int main(int argc, char *argv[]) {
                    fprintf(stderr, "Couldn't open: %s\n", fn);
                    exit(1);
                }
+                /* count how many entries m of dimension k are in this VQ file */
                m[num_stages] = 0;
-                while (fread(features, sizeof(float), k, fq) == (size_t)k) m[num_stages]++;
+                while (fread(features, sizeof(float), k, fq) == (size_t)k)
+                    m[num_stages]++;
                assert(m[num_stages] <= MAX_ENTRIES);
                fprintf(stderr, "%d entries of vectors width %d\n", m[num_stages], k);
+                /* now load VQ into memory */
                rewind(fq);                       
                int rd = fread(&vq[num_stages*k*MAX_ENTRIES], sizeof(float), m[num_stages]*k, fq);
                assert(rd == m[num_stages]*k);
@ -170,6 +181,11 @@ int main(int argc, char *argv[]) {
        case 'u':
            uniform_step = atof(optarg);
            fprintf(stderr, "uniform quant step size: %3.2f dB\n", uniform_step);
+            if (uniform_step2 == 0.0) uniform_step2 = uniform_step; /* default for bands 12..17; keeps an earlier -x */
+            break;
+        case 'x':
+            uniform_step2 = atof(optarg); /* step for bands 12..17 only; takes effect only when -u is also given */
+            fprintf(stderr, "uniform quant step size 12..17: %3.2f dB\n", uniform_step2);
            break;
        case 'v':
            lpcnet_verbose = 1;
@ -181,6 +197,7 @@ int main(int argc, char *argv[]) {
            fprintf(stderr,"usage: %s [Options]:\n  [-d --decimation 1/2/3...]\n  [-q --quant quantfile1,quantfile2,....]\n", argv[0]);
            fprintf(stderr,"  [-g --gain pitch gain bias]\n");
            fprintf(stderr,"  [-h --hard lowerLimitdB\n");
+            fprintf(stderr,"  [-i --mag]\n  [-x --uniform2 stepSizedB]\n");
            fprintf(stderr,"  [-l --label txtLabel]\n");
            fprintf(stderr,"  [-m --mbest survivors]\n  [-o --pitchbits nBits]\n");
            fprintf(stderr,"  [-p --pred predCoff]\n  [-f --first firstElement]\n  [-s --stagevar TxtFile]\n");
@ -190,19 +207,14 @@ int main(int argc, char *argv[]) {
        }
    }

-    fprintf(stderr, "dec: %d pred: %3.2f num_stages: %d mbest: %d small: %d", dec, pred, num_stages, mbest_survivors, small_vec);
+    fprintf(stderr, "dec: %d pred: %3.2f num_stages: %d mbest: %d small: %d logmag: %d",
+            dec, pred, num_stages, mbest_survivors, small_vec, logmag);
    fprintf(stderr, "\n");
    
    /* delay line so we can pass some features (like pitch and voicing) through unmodified */
    float features_prev[dec+1][NB_FEATURES];
    /* adjacent vectors used for linear interpolation, note only 0..17 and 38,39 used */
    float features_lin[2][NB_FEATURES];
-    /* used for optiona smoothing of features */
-    /*
-    float features_mem[NB_BANDS];
-    for(i=0; i<NB_BANDS; i++)
-        features_mem[i] = 0.0;
-    */
    
    for(d=0; d<dec+1; d++)
        for(i=0; i<NB_FEATURES; i++)
@ -258,6 +270,13 @@ int main(int argc, char *argv[]) {
            }
        }

+        /* -i/--mag: replace the first NB_BANDS cepstrals with idct() output, before the x10 dB scaling below */
+        if (logmag) {
+            float tmp[NB_BANDS];   /* scratch output; NOTE(review): presumably idct() cannot run in place - confirm */
+            idct(tmp, features);
+            for(i=0; i<NB_BANDS; i++) features[i] = tmp[i];
+        }
+        
        /* convert cepstrals to dB */
        for(i=0; i<NB_BANDS; i++)
            features[i] *= 10.0;
@ -324,9 +343,11 @@ int main(int argc, char *argv[]) {
                        features_quant[i] = features[i];
                }
                if (uniform_step != 0.0) {
-                    for(i=0; i<NB_BANDS; i++) {
+                    for(i=0; i<12; i++) {
                        features_quant[i] = uniform_step*round(features[i]/uniform_step);
-                        //fprintf(stderr, "%d %f %f\n", i, features[i], features_quant[i]);
+                    }
+                    for(; i<NB_BANDS; i++) {
+                        features_quant[i] = uniform_step2*round(features[i]/uniform_step2);
                    }
                }
            }
@ -400,6 +421,13 @@ int main(int argc, char *argv[]) {
        for(i=0; i<NB_BANDS; i++)
            features_out[i] *= 1/10.0;

+        /* -i/--mag: features_out[0..NB_BANDS-1] hold log magnitudes; dct() them back to cepstrals for lpc_from_cepstrum() below */
+        if (logmag) {
+            float tmp[NB_BANDS];   /* scratch output, copied back over features_out below */
+            dct(tmp, features_out);
+            for(i=0; i<NB_BANDS; i++) features_out[i] = tmp[i];
+       }
+
        /* need to recompute LPCs after every frame, as we have quantised, or interpolated */
        lpc_from_cepstrum(&features_out[2*NB_BANDS+3], features_out);

--- a/train_direct.sh
+++ b/train_direct.sh
@ -1,17 +1,57 @@
 #!/bin/sh
+# train_direct.sh
+# David Rowe March 2019
+# Train multi-stage VQ direct (non predictive) for LPCNet

-PATH=/home/david/codec2-dev/build_linux/misc/
-VQTRAIN=$PATH/vqtrain
-EXTRACT=$PATH/extract
-VQTRAIN=/home/david/codec2-dev/build_linux/misc/vqtrain
-K=8
+PATH=$PATH:/home/david/codec2-dev/build_linux/misc/
+
+# Parse the command line: -i may appear before or after the VQ prefix.
+# LOGMAG and VQ_NAME are cleared first so values inherited from the environment are ignored.
+LOGMAG=
+VQ_NAME=
+for arg in "$@"
+do
+    case $arg in
+        -i) LOGMAG=1 ;;                      # train in the Ly (log magnitude) domain
+        *)  VQ_NAME=${VQ_NAME:-$arg} ;;      # first non-option argument is the file prefix
+    esac
+done
+
+if [ -z "$VQ_NAME" ]; then
+    echo "usage: ./train_direct.sh [-i] VQprefix"
+    echo "       $ ./train_direct.sh direct_v1"
+    echo "  -i   work in Ly (log magnitude) domain"
+    exit 1
+fi
+echo "$VQ_NAME"
+
+K=18         # vector width handed to extract and vqtrain
+FINAL_K=12   # width of the last stage when -i is given
+STOP=1E-3    # value for vqtrain's -s option

 echo "*********"
 echo "Direct"
 echo "*********"
-$EXTRACT all_speech_features.f32 all_speech_direct.f32 0 7 10 0
-$VQTRAIN all_speech_direct.f32 $K 2048 direct_stage1.f32 sd1.f32
-$VQTRAIN sd1.f32 $K 2048 direct_stage2.f32 sd2.f32
-$VQTRAIN sd2.f32 $K 2048 direct_stage3.f32 sd3.f32
-$VQTRAIN sd3.f32 $K 2048 direct_stage4.f32 sd4.f32
-$VQTRAIN sd4.f32 $K 2048 direct_stage5.f32 sd5.f32
+# Scratch files for extract output; the trap removes both when the script exits.
+t=$(mktemp) && t2=$(mktemp) || exit 1
+trap 'rm -f "$t" "$t2"' EXIT
+extract -e $((K - 1)) -g 10 all_speech_features_5e6.f32 "$t"
+if [ -z "$LOGMAG" ]; then
+  echo "weighting dctLy[0] ...."
+  ./weight < "$t" > "${VQ_NAME}_s0.f32"
+else
+  echo "working in Ly (log magnitude) domain"
+  ./idct < "$t" > "${VQ_NAME}_s0.f32"
+fi
+vqtrain "${VQ_NAME}_s0.f32" $K 2048 "${VQ_NAME}_stage1.f32" -r "${VQ_NAME}_s1.f32" -s $STOP
+vqtrain "${VQ_NAME}_s1.f32" $K 2048 "${VQ_NAME}_stage2.f32" -r "${VQ_NAME}_s2.f32" -s $STOP
+vqtrain "${VQ_NAME}_s2.f32" $K 2048 "${VQ_NAME}_stage3.f32" -r "${VQ_NAME}_s3.f32" -s $STOP
+if [ -z "$LOGMAG" ]; then
+  echo "final two stages $K elements"
+  vqtrain "${VQ_NAME}_s3.f32" $K 2048 "${VQ_NAME}_stage4.f32" -r "${VQ_NAME}_s4.f32" -s $STOP  # _s4 is stage 5's input
+  vqtrain "${VQ_NAME}_s4.f32" $K 2048 "${VQ_NAME}_stage5.f32" -r "${VQ_NAME}_s5.f32" -s $STOP
+else
+  echo "final stage $FINAL_K elements"
+  extract -e $((FINAL_K - 1)) -t $K "${VQ_NAME}_s3.f32" "$t2"
+  vqtrain "$t2" $FINAL_K 2048 "${VQ_NAME}_stage4.f32" -r "${VQ_NAME}_s5.f32" -s $STOP
+fi