mirror of https://github.com/drowe67/LPCNet.git
458 lines
16 KiB
C
458 lines
16 KiB
C
/*
|
|
quant_feat.c
|
|
David Rowe Jan 2019
|
|
|
|
Tool for processing a .f32 file of LPCNet features to simulate quantisation:
|
|
|
|
1/ Can decimate cepstrals to 20/30/40/... ms update rate and
|
|
liniearly interpolate back up to 10ms
|
|
2/ Quantise using multistage VQs
|
|
3/ Replace the LPCNet pitch estimate with estimates from external files
|
|
4/ Works from stdin -> stdout to facilitate streaming real time simulations.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <math.h>
|
|
#include <getopt.h>
|
|
|
|
#include "common.h"
|
|
#include "freq.h"
|
|
#include "lpcnet_quant.h"
|
|
|
|
#define NB_FEATURES 55
|
|
#define MAX_STAGES 5 /* max number of VQ stages */
|
|
#define NOUTLIERS 5 /* range of outilers to track in 1dB steps */
|
|
|
|
#define PITCH_MIN_PERIOD 32
|
|
#define PITCH_MAX_PERIOD 256
|
|
|
|
int main(int argc, char *argv[]) {
|
|
FILE *fin, *fout;
|
|
float features[NB_FEATURES], features_out[NB_FEATURES];
|
|
int f = 0, dec = 2;
|
|
float features_quant[NB_FEATURES];
|
|
int indexes[MAX_STAGES];
|
|
float sum_sq_err = 0.0;
|
|
int d,i,n = 0;
|
|
float fract;
|
|
|
|
int c, first = 0, k=NB_BANDS;
|
|
int num_stages = 0;
|
|
float vq[MAX_STAGES*NB_BANDS*MAX_ENTRIES];
|
|
int m[MAX_STAGES];
|
|
float pred = 0.9;
|
|
|
|
char fnames[256];
|
|
char fn[256];
|
|
char *comma, *p;
|
|
FILE *fq;
|
|
|
|
FILE *fpitch = NULL;
|
|
float Fs = 16000.0;
|
|
float uniform_step = 0.0;
|
|
float uniform_step2 = 0.0;
|
|
int mbest_survivors = 0;
|
|
char label[80] = "";
|
|
/* experimental limits for dctLy[0], first cepstral */
|
|
float lower_limit = -200.0;
|
|
float upper_limit = 200.00;
|
|
/* weight applied to first cepstral */
|
|
float weight = 1.0;
|
|
float pitch_gain_bias = 0.0;
|
|
int pitch_bits = 0;
|
|
int small_vec = 0;
|
|
int logmag = 0;
|
|
|
|
for(i=0; i<MAX_STAGES*NB_BANDS*MAX_ENTRIES; i++) vq[i] = 0.0;
|
|
|
|
static struct option long_options[] = {
|
|
{"small", required_argument, 0, 'a'},
|
|
{"decimate", required_argument, 0, 'd'},
|
|
{"extpitch", required_argument, 0, 'e'},
|
|
{"first", required_argument, 0, 'f'},
|
|
{"gain", required_argument, 0, 'g'},
|
|
{"hard", required_argument, 0, 'h'},
|
|
{"label", required_argument, 0, 'l'},
|
|
{"mbest", required_argument, 0, 'm'},
|
|
{"mag", required_argument, 0, 'i'},
|
|
{"pitchquant", required_argument, 0, 'o'},
|
|
{"pred", required_argument, 0, 'p'},
|
|
{"quant", required_argument, 0, 'q'},
|
|
{"stagevar", required_argument, 0, 's'},
|
|
{"uniform", required_argument, 0, 'u'},
|
|
{"verbose", no_argument, 0, 'v'},
|
|
{"uniform2", required_argument, 0, 'x'},
|
|
{"weight", no_argument, 0, 'w'},
|
|
{0, 0, 0, 0}
|
|
};
|
|
|
|
int opt_index = 0;
|
|
|
|
while ((c = getopt_long (argc, argv, "ad:q:vs:f:p:e:u:l:m:h:wg:o:ix:", long_options, &opt_index)) != -1) {
|
|
switch (c) {
|
|
case 'a':
|
|
/* small cpectral vectors - zero out several bands */
|
|
small_vec = 1;
|
|
break;
|
|
case 'f':
|
|
/* start VQ at band first+1 */
|
|
first = atoi(optarg);
|
|
k = NB_BANDS-first;
|
|
fprintf(stderr, "first = %d k = %d\n", first, k);
|
|
break;
|
|
case 's':
|
|
/* text file to dump error (variance) per stage */
|
|
lpcnet_fsv = fopen(optarg, "wt"); assert(lpcnet_fsv != NULL);
|
|
break;
|
|
case 'd':
|
|
dec = atoi(optarg);
|
|
fprintf(stderr, "dec = %d\n", dec);
|
|
break;
|
|
case 'e':
|
|
/* external pitch estimate, one F0 est (Hz) per line of text file */
|
|
fpitch = fopen(optarg, "rt"); assert(fpitch != NULL);
|
|
fprintf(stderr, "ext pitch F0 file: %s\n", optarg);
|
|
break;
|
|
case 'g':
|
|
pitch_gain_bias = atof(optarg);
|
|
fprintf(stderr, "pitch_gain bias: %f\n", pitch_gain_bias);
|
|
break;
|
|
case 'h':
|
|
/* hard limit (saturate) first feature (energy) */
|
|
lower_limit = atof(optarg);
|
|
fprintf(stderr, "lower_limit: %f upper_limit: %f\n", lower_limit, upper_limit);
|
|
break;
|
|
case 'i':
|
|
/* work in log mag rather than cepstral domain */
|
|
logmag = 1;
|
|
break;
|
|
case 'l':
|
|
/* text label to pront with results */
|
|
strcpy(label, optarg);
|
|
break;
|
|
case 'm':
|
|
mbest_survivors = atoi(optarg);
|
|
fprintf(stderr, "mbest_survivors = %d\n", mbest_survivors);
|
|
break;
|
|
case 'o':
|
|
pitch_bits = atoi(optarg);
|
|
fprintf(stderr, "pitch quantised to %d bits\n", pitch_bits);
|
|
break;
|
|
case 'p':
|
|
pred = atof(optarg);
|
|
fprintf(stderr, "pred = %f\n", pred);
|
|
break;
|
|
case 'q':
|
|
/* list of comma delimited file names */
|
|
strcpy(fnames, optarg);
|
|
p = fnames;
|
|
num_stages = 0;
|
|
do {
|
|
assert(num_stages < MAX_STAGES);
|
|
strcpy(fn, p);
|
|
comma = strchr(fn, ',');
|
|
if (comma) {
|
|
*comma = 0;
|
|
p = comma+1;
|
|
}
|
|
/* load quantiser file */
|
|
fprintf(stderr, "stage: %d loading %s ...", num_stages, fn);
|
|
fq=fopen(fn, "rb");
|
|
if (fq == NULL) {
|
|
fprintf(stderr, "Couldn't open: %s\n", fn);
|
|
exit(1);
|
|
}
|
|
/* count how many entries m of dimension k are in this VQ file */
|
|
m[num_stages] = 0;
|
|
while (fread(features, sizeof(float), k, fq) == (size_t)k)
|
|
m[num_stages]++;
|
|
assert(m[num_stages] <= MAX_ENTRIES);
|
|
fprintf(stderr, "%d entries of vectors width %d\n", m[num_stages], k);
|
|
/* now load VQ into memory */
|
|
rewind(fq);
|
|
int rd = fread(&vq[num_stages*k*MAX_ENTRIES], sizeof(float), m[num_stages]*k, fq);
|
|
assert(rd == m[num_stages]*k);
|
|
num_stages++;
|
|
fclose(fq);
|
|
} while(comma);
|
|
break;
|
|
case 'u':
|
|
uniform_step = atof(optarg);
|
|
fprintf(stderr, "uniform quant step size: %3.2f dB\n", uniform_step);
|
|
uniform_step2 = uniform_step;
|
|
break;
|
|
case 'x':
|
|
uniform_step2 = atof(optarg);
|
|
fprintf(stderr, "uniform quant step size 12..17: %3.2f dB\n", uniform_step2);
|
|
break;
|
|
case 'v':
|
|
lpcnet_verbose = 1;
|
|
break;
|
|
case 'w':
|
|
weight = 1.0/sqrt(NB_BANDS);
|
|
break;
|
|
default:
|
|
fprintf(stderr,"usage: %s [Options]:\n [-d --decimation 1/2/3...]\n [-q --quant quantfile1,quantfile2,....]\n", argv[0]);
|
|
fprintf(stderr," [-g --gain pitch gain bias]\n");
|
|
fprintf(stderr," [-h --hard lowerLimitdB\n");
|
|
fprintf(stderr," [-i --mag\n");
|
|
fprintf(stderr," [-l --label txtLabel]\n");
|
|
fprintf(stderr," [-m --mbest survivors]\n [-o --pitchbits nBits]\n");
|
|
fprintf(stderr," [-p --pred predCoff]\n [-f --first firstElement]\n [-s --stagevar TxtFile]\n");
|
|
fprintf(stderr," [-e --extpitch ExtPitchFile]\n [-u --uniform stepSizedB]\n [-v --verbose]\n");
|
|
fprintf(stderr," [-w --weight first cepstral by 1/sqrt(NB_BANDS)\n");
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
fprintf(stderr, "dec: %d pred: %3.2f num_stages: %d mbest: %d small: %d logmag: %d",
|
|
dec, pred, num_stages, mbest_survivors, small_vec, logmag);
|
|
fprintf(stderr, "\n");
|
|
|
|
/* delay line so we can pass some features (like pitch and voicing) through unmodified */
|
|
float features_prev[dec+1][NB_FEATURES];
|
|
/* adjacent vectors used for linear interpolation, note only 0..17 and 38,39 used */
|
|
float features_lin[2][NB_FEATURES];
|
|
|
|
for(d=0; d<dec+1; d++)
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_prev[d][i] = 0.0;
|
|
for(d=0; d<2; d++)
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_lin[d][i] = 0.0;
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_quant[i] = 0.0;
|
|
|
|
fin = stdin;
|
|
fout = stdout;
|
|
|
|
/* dec == 2:
|
|
In.: f2 f3 f4 f5 f6
|
|
Out: f0 (f0+f2)/2) f2 (f2+f4)/2 f4 ....
|
|
|
|
features_prev
|
|
2 f2 f3 f4 f5 f6
|
|
1 f1 f2 f3 f4 f5
|
|
0 f0 f1 f2 f3 f4
|
|
|
|
features_lin
|
|
1 f2 f2 f4 f4 f6
|
|
0 f0 f0 f2 f2 f4
|
|
*/
|
|
|
|
/* dec == 3:
|
|
In.: f3 f4 f5 f6 f7
|
|
Out: .... f0 2f0/3 + f3/3 f0/3 + 2f2/3 f3 2f3/3 + f6/3
|
|
|
|
features_prev
|
|
3 f3 f4 f5 f6 f7
|
|
2 f2 f3 f4 f5 f6
|
|
1 f1 f2 f3 f4 f5
|
|
0 f0 f1 f2 f3 f4
|
|
|
|
features_lin
|
|
1 f3 f3 f3 f6 f6
|
|
0 f0 f0 f0 f3 f3
|
|
*/
|
|
|
|
long noutliers[NOUTLIERS];
|
|
for(i=0; i<NOUTLIERS; i++)
|
|
noutliers[i] = 0;
|
|
int qv = 0;
|
|
|
|
while(fread(features, sizeof(float), NB_FEATURES, fin) == NB_FEATURES) {
|
|
|
|
for(i=0; i<NB_FEATURES; i++) {
|
|
if (isnan(features[i])) {
|
|
fprintf(stderr, "f: %d i: %d\n", f, i);
|
|
}
|
|
}
|
|
|
|
/* optionally convert cepstrals to log magnitudes */
|
|
if (logmag) {
|
|
float tmp[NB_BANDS];
|
|
idct(tmp, features);
|
|
for(i=0; i<NB_BANDS; i++) features[i] = tmp[i];
|
|
}
|
|
|
|
/* convert cepstrals to dB */
|
|
for(i=0; i<NB_BANDS; i++)
|
|
features[i] *= 10.0;
|
|
|
|
/* optional weight on first cepstral which increases at
|
|
sqrt(NB_BANDS) for every dB of speech input power. Note by
|
|
doing it here, we won't be measuring SD of this step, SD
|
|
results will be on weighted vector. */
|
|
features[0] *= weight;
|
|
|
|
/* apply lower limit to features[0] */
|
|
|
|
if (features[0] < lower_limit) features[0] = lower_limit;
|
|
if (features[0] > upper_limit) features[0] = upper_limit;
|
|
|
|
/* optionally load external pitch est sample and replace pitch feature */
|
|
if (fpitch != NULL) {
|
|
float f0;
|
|
if (fscanf(fpitch,"%f\n", &f0)) {
|
|
float pitch_index = 2.0*Fs/f0;
|
|
features[2*NB_BANDS] = 0.01*(pitch_index-200.0);
|
|
//fprintf(stderr,"%d: %f %f %f\n", f, f0, pitch_index, features[2*NB_BANDS]);
|
|
}
|
|
else
|
|
fprintf(stderr, "f0 not read\n");
|
|
}
|
|
|
|
/* optionally pitch gain bias - but I would prefer a non-magic numbers approach */
|
|
features[2*NB_BANDS+1] += pitch_gain_bias;
|
|
|
|
/* maintain delay line of unquantised features for partial quantisation and distortion measure */
|
|
for(d=0; d<dec; d++)
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_prev[d][i] = features_prev[d+1][i];
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_prev[dec][i] = features[i];
|
|
|
|
// clear outpout features to make sure we are not cheating.
|
|
// Note we cant clear quant_out as we need memory of last
|
|
// frames output for pred quant
|
|
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_out[i] = 0.0;
|
|
|
|
if ((f % dec) == 0) {
|
|
/* non-interpolated frame ----------------------------------------*/
|
|
|
|
/* optional quantisation */
|
|
if (num_stages || (uniform_step != 0.0)) {
|
|
if (num_stages) {
|
|
if (mbest_survivors) {
|
|
/* mbest predictive VQ */
|
|
quant_pred_mbest(&features_quant[first], indexes, &features[first], pred, num_stages, vq, m, k, mbest_survivors);
|
|
}
|
|
else {
|
|
/* standard predictive VQ */
|
|
quant_pred(&features_quant[first], &features[first], pred, num_stages, vq, m, k);
|
|
}
|
|
for(i=0; i<first; i++)
|
|
features_quant[i] = features[i];
|
|
}
|
|
if (uniform_step != 0.0) {
|
|
for(i=0; i<12; i++) {
|
|
features_quant[i] = uniform_step*round(features[i]/uniform_step);
|
|
}
|
|
for(; i<NB_BANDS; i++) {
|
|
features_quant[i] = uniform_step2*round(features[i]/uniform_step2);
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
/* unquantised */
|
|
for(i=0; i<NB_BANDS; i++) {
|
|
features_quant[i] = features[i];
|
|
}
|
|
}
|
|
|
|
if (pitch_bits) {
|
|
int ind = pitch_encode(features[2*NB_BANDS], pitch_bits);
|
|
features_quant[2*NB_BANDS] = pitch_decode(pitch_bits, ind);
|
|
ind = pitch_gain_encode(features[2*NB_BANDS+1]);
|
|
features_quant[2*NB_BANDS+1] = pitch_gain_decode(ind);
|
|
}
|
|
else {
|
|
features_quant[2*NB_BANDS] = features[2*NB_BANDS];
|
|
features_quant[2*NB_BANDS+1] = features[2*NB_BANDS+1]; /* pitch gain */
|
|
}
|
|
|
|
/* update linear interpolation arrays */
|
|
for(i=0; i<NB_FEATURES; i++) {
|
|
features_lin[0][i] = features_lin[1][i];
|
|
features_lin[1][i] = features_quant[i];
|
|
}
|
|
|
|
/* pass (quantised) frame through */
|
|
for(i=0; i<NB_BANDS; i++) {
|
|
features_out[i] = features_lin[0][i];
|
|
}
|
|
features_out[2*NB_BANDS] = features_lin[0][2*NB_BANDS];
|
|
features_out[2*NB_BANDS+1] = features_lin[0][2*NB_BANDS+1];
|
|
|
|
/* measure quantisation error power (variance). The
|
|
dec/interp also adds significant distortion however we
|
|
are just counting quantiser distortion here. */
|
|
|
|
float e = 0.0;
|
|
for(i=0; i<NB_BANDS; i++) {
|
|
e += pow(features_out[i]-features_prev[0][i], 2.0);
|
|
}
|
|
sum_sq_err += e; n+= NB_BANDS;
|
|
for (i=NOUTLIERS; i>=0; i--)
|
|
if (sqrt(e/NB_BANDS) > (float)(i+1.0)) {
|
|
noutliers[i]++;
|
|
break;
|
|
}
|
|
qv++;
|
|
|
|
|
|
} else {
|
|
/* interpolated frame ----------------------------------------*/
|
|
|
|
for(i=0; i<NB_FEATURES; i++)
|
|
features_out[i] = 0.0;
|
|
/* interpolate frame */
|
|
d = f%dec;
|
|
for(i=0; i<NB_FEATURES; i++) {
|
|
fract = (float)d/(float)dec;
|
|
features_out[i] = (1.0-fract)*features_lin[0][i] + fract*features_lin[1][i];
|
|
}
|
|
|
|
}
|
|
|
|
f++;
|
|
|
|
features_out[0] /= weight;
|
|
|
|
/* convert cespstrals back from dB */
|
|
for(i=0; i<NB_BANDS; i++)
|
|
features_out[i] *= 1/10.0;
|
|
|
|
/* optionally log magnitudes convert back to cepstrals */
|
|
if (logmag) {
|
|
float tmp[NB_BANDS];
|
|
dct(tmp, features_out);
|
|
for(i=0; i<NB_BANDS; i++) features_out[i] = tmp[i];
|
|
}
|
|
|
|
for(i=0; i<NB_FEATURES; i++) {
|
|
if (isnan(features_out[i])) {
|
|
fprintf(stderr, "f: %d i: %d\n", f, i);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
if (small_vec) {
|
|
/* zero out unused cepstrals in small vec mode */
|
|
for(i=12; i<NB_BANDS; i++)
|
|
features_out[i] = 0.0;
|
|
}
|
|
|
|
fwrite(features_out, sizeof(float), NB_FEATURES, fout);
|
|
fflush(stdin);
|
|
fflush(stdout);
|
|
}
|
|
|
|
float var = sum_sq_err/n;
|
|
fprintf(stderr, "RESULTS %s var: %4.3f sd: %4.3f n: %4d", label, var, sqrt(var), n);
|
|
fprintf(stderr, " outliers > ");
|
|
for (i=0; i<NOUTLIERS; i++)
|
|
fprintf(stderr, "%d ", i+1);
|
|
fprintf(stderr, " dB = ");
|
|
for (i=0; i<NOUTLIERS; i++)
|
|
fprintf(stderr, "%5.4f ", (float)noutliers[i]/qv);
|
|
fprintf(stderr, "\n");
|
|
fclose(fin); fclose(fout); if (lpcnet_fsv != NULL) fclose(lpcnet_fsv); if(fpitch != NULL) fclose(fpitch);
|
|
}
|
|
|
|
|