@@ -36,8 +36,6 @@ namespace {
3636const core::TPersistenceTag PRIOR_TAG{" a" , " prior" };
3737const core::TPersistenceTag CLASS_LABEL_TAG{" b" , " class_label" };
3838const core::TPersistenceTag CLASS_MODEL_TAG{" c" , " class_model" };
39- const core::TPersistenceTag MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG{
40- " d" , " min_max_likelihood_to_use_feature" };
4139const core::TPersistenceTag COUNT_TAG{" e" , " count" };
4240const core::TPersistenceTag CONDITIONAL_DENSITY_FROM_PRIOR_TAG{" f" , " conditional_density_from_prior" };
4341}
@@ -135,24 +133,26 @@ std::string CNaiveBayesFeatureDensityFromPrior::print() const {
135133 return result;
136134}
137135
138- CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar,
139- double decayRate,
140- TOptionalDouble minMaxLogLikelihoodToUseFeature)
141- : m_MinMaxLogLikelihoodToUseFeature{minMaxLogLikelihoodToUseFeature},
142- m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
136+ CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar, double decayRate)
137+ : m_DecayRate{decayRate}, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
143138}
144139
145140CNaiveBayes::CNaiveBayes (const CNaiveBayesFeatureDensity& exemplar,
146141 const SDistributionRestoreParams& params,
147142 core::CStateRestoreTraverser& traverser)
148143 : m_DecayRate{params.s_DecayRate }, m_Exemplar{exemplar.clone ()}, m_ClassConditionalDensities{2 } {
149- traverser.traverseSubLevel (std::bind (&CNaiveBayes::acceptRestoreTraverser, this ,
150- std::cref (params), std::placeholders::_1));
144+ // If we persist before we create class conditional distributions we will
145+ // not have anything to restore and hasSubLevel will be false. Trying to
146+ // restore sets the traverser state to bad so we need to handle explicitly.
147+ if (traverser.hasSubLevel () && traverser.traverseSubLevel ([&](auto & traverser_) {
148+ return this ->acceptRestoreTraverser (params, traverser_);
149+ }) == false ) {
150+ traverser.setBadState ();
151+ }
151152}
152153
153154CNaiveBayes::CNaiveBayes (const CNaiveBayes& other)
154- : m_MinMaxLogLikelihoodToUseFeature{other.m_MinMaxLogLikelihoodToUseFeature },
155- m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
155+ : m_DecayRate{other.m_DecayRate }, m_Exemplar{other.m_Exemplar ->clone ()} {
156156 for (const auto & class_ : other.m_ClassConditionalDensities ) {
157157 m_ClassConditionalDensities.emplace (class_.first , class_.second );
158158 }
@@ -170,9 +170,6 @@ bool CNaiveBayes::acceptRestoreTraverser(const SDistributionRestoreParams& param
170170 std::ref (class_), std::cref (params),
171171 std::placeholders::_1)),
172172 m_ClassConditionalDensities.emplace (label, std::move (class_)))
173- RESTORE_SETUP_TEARDOWN (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG, double value,
174- core::CStringUtils::stringToType (traverser.value (), value),
175- m_MinMaxLogLikelihoodToUseFeature.reset (value))
176173 } while (traverser.next ());
177174 return true ;
178175}
@@ -195,12 +192,6 @@ void CNaiveBayes::acceptPersistInserter(core::CStatePersistInserter& inserter) c
195192 std::ref (class_->second ),
196193 std::placeholders::_1));
197194 }
198-
199- if (m_MinMaxLogLikelihoodToUseFeature) {
200- inserter.insertValue (MIN_MAX_LOG_LIKELIHOOD_TO_USE_FEATURE_TAG,
201- *m_MinMaxLogLikelihoodToUseFeature,
202- core::CIEEE754::E_SinglePrecision);
203- }
204195}
205196
206197CNaiveBayes& CNaiveBayes::operator =(const CNaiveBayes& other) {
@@ -215,26 +206,29 @@ void CNaiveBayes::swap(CNaiveBayes& other) {
215206 std::swap (m_DecayRate, other.m_DecayRate );
216207 m_Exemplar.swap (other.m_Exemplar );
217208 m_ClassConditionalDensities.swap (other.m_ClassConditionalDensities );
218- std::swap (m_MinMaxLogLikelihoodToUseFeature, other.m_MinMaxLogLikelihoodToUseFeature );
219209}
220210
221211bool CNaiveBayes::initialized () const {
222- return m_ClassConditionalDensities.size () > 0 &&
212+ return m_ClassConditionalDensities.empty () == false &&
223213 std::all_of (m_ClassConditionalDensities.begin (),
224214 m_ClassConditionalDensities.end (),
225215 [](const std::pair<std::size_t , CClass>& class_) {
226216 return class_.second .initialized ();
227217 });
228218}
229219
220+ std::size_t CNaiveBayes::numberClasses () const {
221+ return m_ClassConditionalDensities.size ();
222+ }
223+
230224void CNaiveBayes::initialClassCounts (const TDoubleSizePrVec& counts) {
231225 for (const auto & count : counts) {
232226 m_ClassConditionalDensities.emplace (count.second , CClass{count.first });
233227 }
234228}
235229
236230void CNaiveBayes::addTrainingDataPoint (std::size_t label, const TDouble1VecVec& x) {
237- if (! this ->validate (x)) {
231+ if (this ->validate (x) == false ) {
238232 return ;
239233 }
240234
@@ -249,7 +243,7 @@ void CNaiveBayes::addTrainingDataPoint(std::size_t label, const TDouble1VecVec&
249243
250244 bool updateCount{false };
251245 for (std::size_t i = 0 ; i < x.size (); ++i) {
252- if (x[i].size () > 0 ) {
246+ if (x[i].empty () == false ) {
253247 class_.conditionalDensities ()[i]->add (x[i]);
254248 updateCount = true ;
255249 }
@@ -280,62 +274,74 @@ void CNaiveBayes::propagateForwardsByTime(double time) {
280274 }
281275}
282276
283- CNaiveBayes::TDoubleSizePrVec
284- CNaiveBayes::highestClassProbabilities (std::size_t n, const TDouble1VecVec& x) const {
285- TDoubleSizePrVec p (this ->classProbabilities (x));
277+ CNaiveBayes::TDoubleSizePrVecDoublePr
278+ CNaiveBayes::highestClassProbabilities (std::size_t n,
279+ const TDouble1VecVec& x,
280+ const TFeatureWeightProvider& weightProvider) const {
281+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
286282 n = std::min (n, p.size ());
287- std::sort (p.begin (), p.begin () + n, std::greater<TDoubleSizePr >());
288- return TDoubleSizePrVec{p.begin (), p.begin () + n};
283+ std::sort (p.begin (), p.begin () + n, std::greater<>());
284+ return { TDoubleSizePrVec{p.begin (), p.begin () + n}, minFeatureWeight };
289285}
290286
291- double CNaiveBayes::classProbability (std::size_t label, const TDouble1VecVec& x) const {
292- TDoubleSizePrVec p (this ->classProbabilities (x));
287+ CNaiveBayes::TDoubleDoublePr
288+ CNaiveBayes::classProbability (std::size_t label,
289+ const TDouble1VecVec& x,
290+ const TFeatureWeightProvider& weightProvider) const {
291+ auto [p, minFeatureWeight] = this ->classProbabilities (x, weightProvider);
293292 auto i = std::find_if (p.begin (), p.end (), [label](const TDoubleSizePr& p_) {
294293 return p_.second == label;
295294 });
296- return i == p.end () ? 0.0 : i->first ;
295+ return { i == p.end () ? 0.0 : i->first , minFeatureWeight} ;
297296}
298297
299- CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities (const TDouble1VecVec& x) const {
300- if (!this ->validate (x)) {
301- return {};
298+ CNaiveBayes::TDoubleSizePrVecDoublePr
299+ CNaiveBayes::classProbabilities (const TDouble1VecVec& x,
300+ const TFeatureWeightProvider& weightProvider) const {
301+ if (this ->validate (x) == false ) {
302+ return {{}, 0.0 };
302303 }
303304 if (m_ClassConditionalDensities.empty ()) {
304305 LOG_ERROR (<< " Trying to compute class probabilities without supplying training data" );
305- return {};
306+ return {{}, 0.0 };
306307 }
307308
308309 using TDoubleVec = std::vector<double >;
309- using TMaxAccumulator = CBasicStatistics::SMax<double >::TAccumulator;
310310
311311 TDoubleSizePrVec p;
312312 p.reserve (m_ClassConditionalDensities.size ());
313313 for (const auto & class_ : m_ClassConditionalDensities) {
314314 p.emplace_back (CTools::fastLog (class_.second .count ()), class_.first );
315315 }
316+ double minFeatureWeight{1.0 };
316317
317318 TDoubleVec logLikelihoods;
318319 for (std::size_t i = 0 ; i < x.size (); ++i) {
319- if (x[i].size () > 0 ) {
320- TMaxAccumulator maxLogLikelihood ;
320+ if (x[i].empty () == false ) {
321+ auto & featureWeight = weightProvider () ;
321322 logLikelihoods.clear ();
322323 for (const auto & class_ : m_ClassConditionalDensities) {
323324 const auto & density = class_.second .conditionalDensities ()[i];
324325 double logLikelihood{density->logValue (x[i])};
325326 double logMaximumLikelihood{density->logMaximumValue ()};
326- maxLogLikelihood.add (logLikelihood - logMaximumLikelihood);
327327 logLikelihoods.push_back (logLikelihood);
328+ featureWeight.add (class_.first , logLikelihood - logMaximumLikelihood);
328329 }
329- double weight{1.0 };
330- if (m_MinMaxLogLikelihoodToUseFeature) {
331- weight = CTools::logisticFunction (
332- (maxLogLikelihood[0 ] - *m_MinMaxLogLikelihoodToUseFeature) /
333- std::fabs (*m_MinMaxLogLikelihoodToUseFeature),
334- 0.1 );
335- }
330+
331+ // We compute the class c_i probability using
332+ //
333+ // p(c_i | x) = exp(sum_j{w_j * log(L(x_j | c_i))}) / Z * p(c_i).
334+ //
335+ // Any feature whose weight w_j < 1 has its significance reduced in class
336+ // selection: effectively we raise its likelihood to the power w_j, which
337+ // tends to 1 for all values if w_j is small enough. This can be used to
338+ // ignore features for which x is in the extreme tails of the class
339+ // conditional distribution.
340+ double featureWeight_{featureWeight.calculate ()};
336341 for (std::size_t j = 0 ; j < logLikelihoods.size (); ++j) {
337- p[j].first += weight * logLikelihoods[j];
342+ p[j].first += featureWeight_ * logLikelihoods[j];
338343 }
344+ minFeatureWeight = std::min (minFeatureWeight, featureWeight_);
339345 }
340346 }
341347
@@ -349,7 +355,7 @@ CNaiveBayes::TDoubleSizePrVec CNaiveBayes::classProbabilities(const TDouble1VecV
349355 pc.first /= Z;
350356 }
351357
352- return p ;
358+ return { std::move (p), minFeatureWeight} ;
353359}
354360
355361void CNaiveBayes::debugMemoryUsage (const core::CMemoryUsage::TMemoryUsagePtr& mem) const {
@@ -363,8 +369,7 @@ std::size_t CNaiveBayes::memoryUsage() const {
363369 core::CMemory::dynamicSize (m_ClassConditionalDensities);
364370}
365371
366- uint64_t CNaiveBayes::checksum (uint64_t seed) const {
367- CChecksum::calculate (seed, m_MinMaxLogLikelihoodToUseFeature);
372+ std::uint64_t CNaiveBayes::checksum (std::uint64_t seed) const {
368373 CChecksum::calculate (seed, m_DecayRate);
369374 CChecksum::calculate (seed, m_Exemplar);
370375 return CChecksum::calculate (seed, m_ClassConditionalDensities);
@@ -386,7 +391,7 @@ std::string CNaiveBayes::print() const {
386391bool CNaiveBayes::validate (const TDouble1VecVec& x) const {
387392 auto class_ = m_ClassConditionalDensities.begin ();
388393 if (class_ != m_ClassConditionalDensities.end () &&
389- class_->second .conditionalDensities ().size () > 0 &&
394+ class_->second .conditionalDensities ().empty () == false &&
390395 class_->second .conditionalDensities ().size () != x.size ()) {
391396 LOG_ERROR (<< " Unexpected feature vector: " << core::CContainerPrinter::print (x));
392397 return false ;
@@ -423,7 +428,7 @@ bool CNaiveBayes::CClass::acceptRestoreTraverser(const SDistributionRestoreParam
423428void CNaiveBayes::CClass::acceptPersistInserter (core::CStatePersistInserter& inserter) const {
424429 inserter.insertValue (COUNT_TAG, m_Count, core::CIEEE754::E_SinglePrecision);
425430 for (const auto & density : m_ConditionalDensities) {
426- if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ())) {
431+ if (dynamic_cast <const CNaiveBayesFeatureDensityFromPrior*>(density.get ()) != nullptr ) {
427432 inserter.insertLevel (CONDITIONAL_DENSITY_FROM_PRIOR_TAG,
428433 std::bind (&CNaiveBayesFeatureDensity::acceptPersistInserter,
429434 density.get (), std::placeholders::_1));
0 commit comments