From 086ee67d839b33bf475177f680fcc848a0625266 Mon Sep 17 00:00:00 2001 From: Pieter Wuille Date: Fri, 27 Nov 2015 13:20:29 +0100 Subject: Switch to a more efficient rolling Bloom filter For each 'bit' in the filter we really maintain 2 bits, which store either: 0: not set 1-3: set in generation N After (nElements / 2) insertions, we switch to a new generation, and wipe entries which already had the new generation number, effectively switching from the last 1.5 * nElements set to the last 1.0 * nElements set. This is 25% more space efficient than the previous implementation, and can (at peak) store 1.5 times the requested amount of history (though only 1.0 times the requested history is guaranteed). The existing unit tests should be sufficient. --- src/bloom.cpp | 77 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 24 deletions(-) (limited to 'src/bloom.cpp') diff --git a/src/bloom.cpp b/src/bloom.cpp index de8720659..4bda2bbce 100644 --- a/src/bloom.cpp +++ b/src/bloom.cpp @@ -216,30 +216,54 @@ void CBloomFilter::UpdateEmptyFull() isEmpty = empty; } -CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate) : - b1(nElements * 2, fpRate, 0), b2(nElements * 2, fpRate, 0) +CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate) { - // Implemented using two bloom filters of 2 * nElements each. - // We fill them up, and clear them, staggered, every nElements - // inserted, so at least one always contains the last nElements - // inserted. - nInsertions = 0; - nBloomSize = nElements * 2; - + double logFpRate = log(fpRate); + /* The optimal number of hash functions is log(fpRate) / log(0.5), but + * restrict it to the range 1-50. */ + nHashFuncs = std::max(1, std::min((int)round(logFpRate / log(0.5)), 50)); + /* In this rolling bloom filter, we'll store between 2 and 3 generations of nElements / 2 entries. */ + nEntriesPerGeneration = (nElements + 1) / 2; + uint32_t nMaxElements = nEntriesPerGeneration * 3; + /* The maximum fpRate = pow(1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits), nHashFuncs) + * => pow(fpRate, 1.0 / nHashFuncs) = 1.0 - exp(-nHashFuncs * nMaxElements / nFilterBits) + * => 1.0 - pow(fpRate, 1.0 / nHashFuncs) = exp(-nHashFuncs * nMaxElements / nFilterBits) + * => log(1.0 - pow(fpRate, 1.0 / nHashFuncs)) = -nHashFuncs * nMaxElements / nFilterBits + * => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - pow(fpRate, 1.0 / nHashFuncs)) + * => nFilterBits = -nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs)) + */ + uint32_t nFilterBits = (uint32_t)ceil(-1.0 * nHashFuncs * nMaxElements / log(1.0 - exp(logFpRate / nHashFuncs))); + data.clear(); + /* We store up to 16 'bits' per data element. */ + data.resize((nFilterBits + 15) / 16); reset(); } +/* Similar to CBloomFilter::Hash */ +inline unsigned int CRollingBloomFilter::Hash(unsigned int nHashNum, const std::vector& vDataToHash) const { + return MurmurHash3(nHashNum * 0xFBA4C795 + nTweak, vDataToHash) % (data.size() * 16); +} + void CRollingBloomFilter::insert(const std::vector& vKey) { - if (nInsertions == 0) { - b1.clear(); - } else if (nInsertions == nBloomSize / 2) { - b2.clear(); + if (nEntriesThisGeneration == nEntriesPerGeneration) { + nEntriesThisGeneration = 0; + nGeneration++; + if (nGeneration == 4) { + nGeneration = 1; + } + /* Wipe old entries that used this generation number. */ + for (uint32_t p = 0; p < data.size() * 16; p++) { + if (get(p) == nGeneration) { + put(p, 0); + } + } } - b1.insert(vKey); - b2.insert(vKey); - if (++nInsertions == nBloomSize) { - nInsertions = 0; + nEntriesThisGeneration++; + + for (int n = 0; n < nHashFuncs; n++) { + uint32_t h = Hash(n, vKey); + put(h, nGeneration); } } @@ -251,10 +275,13 @@ void CRollingBloomFilter::insert(const uint256& hash) bool CRollingBloomFilter::contains(const std::vector& vKey) const { - if (nInsertions < nBloomSize / 2) { - return b2.contains(vKey); + for (int n = 0; n < nHashFuncs; n++) { + uint32_t h = Hash(n, vKey); + if (get(h) == 0) { + return false; + } } - return b1.contains(vKey); + return true; } bool CRollingBloomFilter::contains(const uint256& hash) const @@ -265,8 +292,10 @@ bool CRollingBloomFilter::contains(const uint256& hash) const void CRollingBloomFilter::reset() { - unsigned int nNewTweak = GetRand(std::numeric_limits::max()); - b1.reset(nNewTweak); - b2.reset(nNewTweak); - nInsertions = 0; + nTweak = GetRand(std::numeric_limits::max()); + nEntriesThisGeneration = 0; + nGeneration = 1; + for (std::vector::iterator it = data.begin(); it != data.end(); it++) { + *it = 0; + } } -- cgit v1.2.3