rhubarb-lip-sync/lib/webrtc-8d2248ff/webrtc/tools/agc/activity_metric.cc

/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <memory>

#include "gflags/gflags.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "webrtc/modules/audio_processing/agc/agc.h"
#include "webrtc/modules/audio_processing/agc/loudness_histogram.h"
#include "webrtc/modules/audio_processing/agc/utility.h"
#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
#include "webrtc/modules/audio_processing/vad/common.h"
#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
#include "webrtc/modules/include/module_common_types.h"

static const int kAgcAnalWindowSamples = 100;
static const double kDefaultActivityThreshold = 0.3;

DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
              " format");
DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
              " probabilities) in double format. One activity per 10ms is"
              " required. If no file is given the video information is not"
              " incorporated. Negative activity is interpreted as video is"
              " not adapted and the statistics are not computed during"
              " the learning phase. Note that the negative video activities"
              " are ONLY allowed at the beginning.");
DEFINE_string(result, "", "name of a file to write the results. The results"
              " will be appended to the end of the file. This is optional.");
DEFINE_string(audio_content, "", "name of a file where audio content is written"
              " to, in double format.");
DEFINE_double(activity_threshold, kDefaultActivityThreshold,
              "Activity threshold");

namespace webrtc {

// TODO(turajs) A new CL will be committed soon where ExtractFeatures will
// notify the caller of "silence" input, instead of bailing out. We would not
// need the following function when such a change is made.

// Add some dither to quiet frames. This avoids the ExtractFeatures skip a
// silence frame. Otherwise true VAD would drift with respect to the audio.
// We only consider mono inputs.
static void DitherSilence(AudioFrame* frame) {
  ASSERT_EQ(1u, frame->num_channels_);
  const double kRmsSilence = 5;
  const double sum_squared_silence = kRmsSilence * kRmsSilence *
      frame->samples_per_channel_;
  double sum_squared = 0;
  for (size_t n = 0; n < frame->samples_per_channel_; n++)
    sum_squared += frame->data_[n] * frame->data_[n];
  if (sum_squared <= sum_squared_silence) {
    for (size_t n = 0; n < frame->samples_per_channel_; n++)
      frame->data_[n] = (rand() & 0xF) - 8;  // NOLINT: ignore non-threadsafe.
  }
}

class AgcStat {
 public:
  AgcStat()
      : video_index_(0),
        activity_threshold_(kDefaultActivityThreshold),
        audio_content_(LoudnessHistogram::Create(kAgcAnalWindowSamples)),
        audio_processing_(new VadAudioProc()),
        vad_(new PitchBasedVad()),
        standalone_vad_(StandaloneVad::Create()),
        audio_content_fid_(NULL) {
    for (size_t n = 0; n < kMaxNumFrames; n++)
      video_vad_[n] = 0.5;
  }

  ~AgcStat() {
    if (audio_content_fid_ != NULL) {
      fclose(audio_content_fid_);
    }
  }

  void set_audio_content_file(FILE* audio_content_fid) {
    audio_content_fid_ = audio_content_fid;
  }

  int AddAudio(const AudioFrame& frame, double p_video,
               int* combined_vad) {
    if (frame.num_channels_ != 1 ||
        frame.samples_per_channel_ !=
            kSampleRateHz / 100 ||
            frame.sample_rate_hz_ != kSampleRateHz)
      return -1;
    video_vad_[video_index_++] = p_video;
    AudioFeatures features;
    audio_processing_->ExtractFeatures(
        frame.data_, frame.samples_per_channel_, &features);
    if (FLAGS_standalone_vad) {
      standalone_vad_->AddAudio(frame.data_,
                                frame.samples_per_channel_);
    }
    if (features.num_frames > 0) {
      double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
      if (FLAGS_standalone_vad) {
        standalone_vad_->GetActivity(p, kMaxNumFrames);
      }
      // TODO(turajs) combining and limiting are used in the source files as
      // well they can be moved to utility.
      // Combine Video and stand-alone VAD.
      for (size_t n = 0; n < features.num_frames; n++) {
        double p_active = p[n] * video_vad_[n];
        double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
        p[n]  = p_active / (p_active + p_passive);
        // Limit probabilities.
        p[n] = std::min(std::max(p[n], 0.01), 0.99);
      }
      if (vad_->VoicingProbability(features, p) < 0)
        return -1;
      for (size_t n = 0; n < features.num_frames; n++) {
        audio_content_->Update(features.rms[n], p[n]);
        double ac = audio_content_->AudioContent();
        if (audio_content_fid_ != NULL) {
          fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
        }
        if (ac > kAgcAnalWindowSamples * activity_threshold_) {
          combined_vad[n] = 1;
        } else {
          combined_vad[n] = 0;
        }
      }
      video_index_ = 0;
    }
    return static_cast<int>(features.num_frames);
  }

  void Reset() {
    audio_content_->Reset();
  }

  void SetActivityThreshold(double activity_threshold) {
    activity_threshold_ = activity_threshold;
  }

 private:
  int video_index_;
  double activity_threshold_;
  double video_vad_[kMaxNumFrames];
  std::unique_ptr<LoudnessHistogram> audio_content_;
  std::unique_ptr<VadAudioProc> audio_processing_;
  std::unique_ptr<PitchBasedVad> vad_;
  std::unique_ptr<StandaloneVad> standalone_vad_;

  FILE* audio_content_fid_;
};


void void_main(int argc, char* argv[]) {
  webrtc::AgcStat agc_stat;

  FILE* pcm_fid = fopen(argv[1], "rb");
  ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];

  if (argc < 2) {
    fprintf(stderr, "\nNot Enough arguments\n");
  }

  FILE* true_vad_fid = NULL;
  ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true "
      "VADs using --true_vad flag.";
  true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb");
  ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
      FLAGS_true_vad;

  FILE* results_fid = NULL;
  if (FLAGS_result.size() > 0) {
    // True if this is the first time writing to this function and we add a
    // header to the beginning of the file.
    bool write_header;
    // Open in the read mode. If it fails, the file doesn't exist and has to
    // write a header for it. Otherwise no need to write a header.
    results_fid = fopen(FLAGS_result.c_str(), "r");
    if (results_fid == NULL) {
      write_header = true;
    } else {
      fclose(results_fid);
      write_header = false;
    }
    // Open in append mode.
    results_fid = fopen(FLAGS_result.c_str(), "a");
    ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
              FLAGS_result << ", to write the results.";
    // Write the header if required.
    if (write_header) {
      fprintf(results_fid, "%% Total Active,  Misdetection,  "
              "Total inactive,  False Positive,  On-sets,  Missed segments,  "
              "Average response\n");
    }
  }

  FILE* video_vad_fid = NULL;
  if (FLAGS_video_vad.size() > 0) {
    video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb");
    ASSERT_TRUE(video_vad_fid != NULL) <<  "Cannot open the file, " <<
              FLAGS_video_vad << " to read video-based VAD decisions.\n";
  }

  // AgsStat will be the owner of this file and will close it at its
  // destructor.
  FILE* audio_content_fid = NULL;
  if (FLAGS_audio_content.size() > 0) {
    audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb");
    ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
              FLAGS_audio_content << " to write audio-content.\n";
    agc_stat.set_audio_content_file(audio_content_fid);
  }

  webrtc::AudioFrame frame;
  frame.num_channels_ = 1;
  frame.sample_rate_hz_ = 16000;
  frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
  const size_t kSamplesToRead = frame.num_channels_ *
      frame.samples_per_channel_;

  agc_stat.SetActivityThreshold(FLAGS_activity_threshold);

  int ret_val = 0;
  int num_frames = 0;
  int agc_vad[kMaxNumFrames];
  uint8_t true_vad[kMaxNumFrames];
  double p_video = 0.5;
  int total_active = 0;
  int total_passive = 0;
  int total_false_positive = 0;
  int total_missed_detection = 0;
  int onset_adaptation = 0;
  int num_onsets = 0;
  bool onset = false;
  uint8_t previous_true_vad = 0;
  int num_not_adapted = 0;
  size_t true_vad_index = 0;
  bool in_false_positive_region = false;
  int total_false_positive_duration = 0;
  bool video_adapted = false;
  while (kSamplesToRead == fread(frame.data_, sizeof(int16_t),
                                 kSamplesToRead, pcm_fid)) {
    assert(true_vad_index < kMaxNumFrames);
    ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
                        true_vad_fid))
        << "Size mismatch between True-VAD and the PCM file.\n";
    if (video_vad_fid != NULL) {
      ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
          "Not enough video-based VAD probabilities.";
    }

    // Negative video activity indicates that the video-based VAD is not yet
    // adapted. Disregards the learning phase in statistics.
    if (p_video < 0) {
      if (video_adapted) {
        fprintf(stderr, "Negative video probabilities ONLY allowed at the "
            "beginning of the sequence, not in the middle.\n");
        exit(1);
      }
      continue;
    } else {
      video_adapted = true;
    }

    num_frames++;
    uint8_t last_true_vad;
    if (true_vad_index == 0) {
      last_true_vad = previous_true_vad;
    } else {
      last_true_vad = true_vad[true_vad_index - 1];
    }
    if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
      agc_stat.Reset();
    }
    true_vad_index++;

    DitherSilence(&frame);

    ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
    ASSERT_GE(ret_val, 0);

    if (ret_val > 0) {
      ASSERT_EQ(true_vad_index, static_cast<size_t>(ret_val));
      for (int n = 0; n < ret_val; n++) {
        if (true_vad[n] == 1) {
          total_active++;
          if (previous_true_vad == 0) {
            num_onsets++;
            onset = true;
          }
          if (agc_vad[n] == 0) {
            total_missed_detection++;
            if (onset)
              onset_adaptation++;
          } else {
            in_false_positive_region = false;
            onset = false;
          }
        } else if (true_vad[n] == 0) {
          // Check if |on_set| flag is still up. If so it means that we totally
          // missed an active region
          if (onset)
            num_not_adapted++;
          onset = false;

          total_passive++;
          if (agc_vad[n] == 1) {
            total_false_positive++;
            in_false_positive_region = true;
          }
          if (in_false_positive_region) {
            total_false_positive_duration++;
          }
        } else {
          ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
        }
        previous_true_vad = true_vad[n];
      }
      true_vad_index = 0;
    }
  }

  if (results_fid != NULL) {
    fprintf(results_fid, "%4d  %4d  %4d  %4d  %4d  %4d  %4.0f %4.0f\n",
            total_active,
            total_missed_detection,
            total_passive,
            total_false_positive,
            num_onsets,
            num_not_adapted,
            static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
            static_cast<float>(total_false_positive_duration) /
            (total_passive + 1e-12));
  }
  fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
          total_active,
          total_missed_detection,
          total_passive,
          total_false_positive,
          num_onsets,
          num_not_adapted,
          static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
          static_cast<float>(total_false_positive_duration) /
              (total_passive + 1e-12));

  fclose(true_vad_fid);
  fclose(pcm_fid);
  if (video_vad_fid != NULL) {
    fclose(video_vad_fid);
  }
  if (results_fid != NULL) {
    fclose(results_fid);
  }
}

}  // namespace webrtc

int main(int argc, char* argv[]) {
  char kUsage[] =
      "\nCompute the number of misdetected and false-positive frames. Not\n"
      " that for each frame of audio (10 ms) there should be one true\n"
      " activity. If any video-based activity is given, there should also be\n"
      " one probability per frame.\n"
      "\nUsage:\n\n"
      "activity_metric input_pcm [options]\n"
      "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
      "format.\n\n";
  google::SetUsageMessage(kUsage);
  google::ParseCommandLineFlags(&argc, &argv, true);
  webrtc::void_main(argc, argv);
  return 0;
}