/* * vad.cpp * * This is a simple vad implementation. It isn't tuned and testing * has been somewhat limited... but it seems to work ok. All numbers * that are db are multiplied by 100 to keep them slightly more * accurate and easy to use. (so -7450 is -74.50db) This code use * three functions not provided as part of this file : log10_32, bqInit * and bqProcess. The log function calculates : * 100 * 10 * log(x) - 9333 * which sould be an approximation of the signal energy x 100. * bqInit initializes a biquad data structure and bqProcess applies a * a biquad to a signal buffer. A biquad is used by the vad to filter * out lower frequency background noise. * * This code operates under the assumption that it will get frames with * lengths which are a multiple of 5ms. This is relatively easy to * change since the only real dependency is the hangover count -- its in * frames now but could easily be changed to samples. * */ #define VAD_CPP #include "c_utils.h" #include "c_vad.h" #define LOCAL static #define DATA_FRAME_LENGTH (5*8) /* * x = 1.5sec * 1000ms/sec * 8 samples/1 ms * samples => +1dB every x samples */ #define VAD_NOISEFLOOR_CNT_INIT (int)(8*1500) #define VAD_SIGNALMAX_CNT_INIT (int)(8*1500) /* Power Thresholds */ #define VAD_NOISE_TH_BASE (float) 10.00 /* 10.00 dB Noise Threshold */ #define VAD_NOISE_FLOOR_INIT (float)-74.00 /* -74.00 dB Initial Noise Floor */ #define VAD_SIGNAL_MAX_INIT (float)-80.00 /* -80.00 dB Initial Noise Max */ #define VAD_NOISE_TH_MIN (float) 1.00 /* 1.00 dB Minimum Noise Threshold */ /* High Pass Filter for getting rid of background noise from * input signal before energy calculations */ /* Butter : */ #define vhpfB0 (S2byte) 14339 #define vhpfB1 (S2byte)-28678 #define vhpfB2 (S2byte) 14339 #define vhpfA1 (S2byte)-28422 #define vhpfA2 (S2byte) 12550 /* Number of samples of silence before we declare silence period */ /* #samples = 8 samples/ms * 500ms */ #define VAD_HANGOVER_CNT_INIT (int)(8*500) typedef enum { VadState_Silence = 0, VadState_Speech, VadState_Unknown } t_VadState; typedef struct _vad { boolean enabled; /* Saved STA between input frames */ U4byte sta; /* state == 1 if VOICE * state == 0 if SILENCE */ t_VadState state; /* Countdown of consecutive frames before we declare silence */ int hangoverCnt; /* Threshold above which a signal is considered to be speech */ float noiseTH; /* Countdown after which the noise floor is * incremented by 1dB */ int noiseFloorCnt; /* Noise floor in dB */ float noiseFloor; /* Countdown after which the signal max is * decremented by 1dB */ int signalMaxCnt; /* Signal max in dB */ float signalMax; /* STARise == 1 if sta is rising * STARise == 0 if sta is falling */ int STARise; int stateTxCount; /* High Pass Filter for input signal */ t_biquad *bq; } t_vad; /* LOCAL */ LOCAL t_biquad vadbq; LOCAL t_vad vadd; void vadInit() { vadd.enabled = TRUE; vadd.bq = &vadbq; vadd.sta = 10000; vadd.noiseTH = VAD_NOISE_TH_BASE; vadd.state = VadState_Unknown; vadd.noiseFloorCnt = VAD_NOISEFLOOR_CNT_INIT; vadd.noiseFloor = VAD_NOISE_FLOOR_INIT; vadd.hangoverCnt = VAD_HANGOVER_CNT_INIT; vadd.STARise = 1; vadd.stateTxCount = 0; vadd.signalMax = VAD_SIGNAL_MAX_INIT; vadd.signalMaxCnt = VAD_SIGNALMAX_CNT_INIT; bqInit(vadd.bq, vhpfB0, vhpfB1, vhpfB2, vhpfA1, vhpfA2); } LOCAL U4byte computeSTA(S2byte *pdata, int length, U4byte *minSta) { int i; S4byte acc0,acc1; U4byte maxSta; *minSta = vadd.sta; maxSta = vadd.sta; for (i = 0; i < length; i++) { /* q.15 * q.15 = q.30 */ acc1 = pdata[i] * pdata[i]; if ( vadd.STARise ) { acc0 = -1 * (S4byte)(vadd.sta >> 6); acc1 = acc1 >> 5; } else { acc0 = -1 * (S4byte)(vadd.sta >> 9); acc1 = acc1 >> 8; } /* if */ acc0 += acc1; vadd.STARise = ( 0 >= acc0 ) ? 0 : 1; vadd.sta += acc0; if ( vadd.sta > maxSta ) { maxSta = vadd.sta; // arijit - i added the cast } else if ( vadd.sta < *minSta ) { *minSta = vadd.sta; } } /* for */ return maxSta; } LOCAL void computeNFE(float minpower, float maxpower, int length) { if ( minpower <= vadd.noiseFloor ) { vadd.noiseFloor = minpower; vadd.noiseFloorCnt = VAD_NOISEFLOOR_CNT_INIT; } else { if ( vadd.noiseFloorCnt < length ) { vadd.noiseFloor += 1; vadd.noiseFloorCnt = (VAD_NOISEFLOOR_CNT_INIT + vadd.noiseFloorCnt - length); } else { vadd.noiseFloorCnt -= length; } } } unsigned long stopCount = 32000; LOCAL boolean vadSubProcess(S2byte *data, int length) { boolean SpeechDetected; boolean FrameSpeechFlag; S2byte tmpData[DATA_FRAME_LENGTH]; U4byte sta[2]; float power[2]; static unsigned long count = 0; SpeechDetected = TRUE; FrameSpeechFlag = FALSE; bqProcess(vadd.bq, data, tmpData, length); sta[1] = computeSTA(tmpData, length, &sta[0]); calcPower(2, sta, power); computeNFE(power[0], power[1], length); count += length; if (count >= stopCount) { count = 0; } if (power[1] > (vadd.noiseFloor + vadd.noiseTH)) { FrameSpeechFlag = TRUE; } if ( FrameSpeechFlag == FALSE) { if ( vadd.hangoverCnt < length ) { SpeechDetected = FALSE; vadd.hangoverCnt = 0; if ( vadd.state != VadState_Silence ) { vadd.stateTxCount++; } vadd.state = VadState_Silence; } else { vadd.hangoverCnt -= length; } } else { vadd.hangoverCnt = VAD_HANGOVER_CNT_INIT; if ( vadd.state == VadState_Silence ) { vadd.stateTxCount++; } vadd.state = VadState_Speech; } return SpeechDetected; } /* * Returns: true for speech * false for silence */ boolean vadProcess(S2byte *data, int length) { /* vadProcess locals */ int idx; int step; boolean ret; ret = FALSE; if ( vadd.enabled == TRUE ) { /* Cut up the frame into 5ms chunks for processing purposes */ for (idx = 0; length > 0; length -= step) { step = (length < DATA_FRAME_LENGTH) ? length : DATA_FRAME_LENGTH; ret |= vadSubProcess(&data[idx], step); idx += step; } } else { ret = TRUE; } return ret; }