void Main() { var filterSize = 8192; var words = 5000; var streamLength = 200000; var filter = new BloomFilter(filterSize); var source = new List(Words.Take(words)); var bloomCount = 0; var realCount = 0; foreach (var word in source) filter.Add(word); var bloomTime = Stopwatch.StartNew(); foreach (var word in Text.Take(streamLength)) { if (filter.Contains(word)) bloomCount++; } bloomTime.Stop(); var realTime = Stopwatch.StartNew(); foreach (var word in Text.Take(streamLength)) { if (source.Contains(word)) realCount++; } realTime.Stop(); Console.WriteLine("False positives: {0}.", (bloomCount - realCount)); Console.WriteLine("Time exact: {0}ms.", realTime.ElapsedMilliseconds); Console.WriteLine(); Console.WriteLine("Passed filter: {0}.", bloomCount); Console.WriteLine("Time estimate: {0}ms.", bloomTime.ElapsedMilliseconds); } String basePath = @"C:\Users\Florian\Dropbox\Studienstiftung\Academy\Code\"; IEnumerable Words { get { using (var fs = File.Open(basePath + "words.txt", FileMode.Open)) { using (var reader = new StreamReader(fs)) { while (!reader.EndOfStream) yield return reader.ReadLine(); } } } } IEnumerable Text { get { using (var fs = File.Open(basePath + "text.txt", FileMode.Open)) { using (var reader = new StreamReader(fs)) { while (!reader.EndOfStream) { var line = reader.ReadLine(); var words = line.Split(' '); foreach (var word in words) yield return word; } } } } } public class BloomFilter { readonly Int32 hashFunctionCount; readonly BitArray hashBits; readonly Func getHashSecondary; public BloomFilter(Int32 capacity) : this(capacity, null) { } public BloomFilter(Int32 capacity, Func hashFunction) : this(capacity, BestErrorRate(capacity), hashFunction) { } public BloomFilter(Int32 capacity, Single errorRate, Func hashFunction) : this(capacity, errorRate, hashFunction, BestN(capacity, errorRate), BestK(capacity, errorRate)) { } public BloomFilter(Int32 capacity, Single errorRate, Func hashFunction, Int32 n, Int32 k) { if (capacity < 1) throw new ArgumentOutOfRangeException("capacity", capacity, "capacity must be > 0"); if (errorRate >= 1 || errorRate <= 0) throw new ArgumentOutOfRangeException("errorRate", errorRate, String.Format("errorRate must be between 0 and 1, exclusive. Was {0}", errorRate)); if (n < 1) // from overflow in bestM calculation throw new ArgumentOutOfRangeException(String.Format("The provided capacity and errorRate values would result in an array of length > int.MaxValue. Please reduce either of these values. Capacity: {0}, Error rate: {1}", capacity, errorRate)); if (hashFunction == null) { if (typeof(T) == typeof(String)) getHashSecondary = HashString; else if (typeof(T) == typeof(Int32)) getHashSecondary = HashInt32; else throw new ArgumentNullException("hashFunction", "Please provide a hash function for your type T, when T is not a string or int."); } else getHashSecondary = hashFunction; hashFunctionCount = k; hashBits = new BitArray(n); } public void Add(T item) { // start flipping bits for each hash of item var primaryHash = item.GetHashCode(); var secondaryHash = getHashSecondary(item); for (int i = 0; i < hashFunctionCount; i++) { var hash = ComputeHash(primaryHash, secondaryHash, i); hashBits[hash] = true; } } public Boolean Contains(T item) { var primaryHash = item.GetHashCode(); var secondaryHash = getHashSecondary(item); for (int i = 0; i < hashFunctionCount; i++) { var hash = ComputeHash(primaryHash, secondaryHash, i); if (hashBits[hash] == false) return false; } return true; } public Double Truthiness { get { return (Double)TrueBits / hashBits.Count; } } Int32 TrueBits { get { int output = 0; foreach (bool bit in hashBits) { if (bit) output++; } return output; } } Int32 ComputeHash(Int32 primaryHash, Int32 secondaryHash, Int32 i) { var resultingHash = (primaryHash + (i * secondaryHash)) % hashBits.Count; return Math.Abs(resultingHash); } static Int32 BestK(Int32 capacity, Single errorRate) { return (Int32)Math.Round(Math.Log(2.0) * BestN(capacity, errorRate) / capacity); } static Int32 BestN(Int32 capacity, Single errorRate) { return (Int32)Math.Ceiling(capacity * Math.Log(errorRate, (1.0 / Math.Pow(2, Math.Log(2.0))))); } static Single BestErrorRate(Int32 capacity) { var c = 1f / capacity; if (c != 0) return c; // http://www.cs.princeton.edu/courses/archive/spring02/cs493/lec7.pdf return (Single)Math.Pow(0.6185, (Double)Int32.MaxValue / capacity); } static Int32 HashInt32(T input) { var x = (input as Int32?).Value; unchecked { x = ~x + (x << 15); // x = (x << 15) - x- 1, as (~x) + y is equivalent to y - x - 1 in two's complement representation x = x ^ (x >> 12); x = x + (x << 2); x = x ^ (x >> 4); x = x * 2057; // x = (x + (x << 3)) + (x << 11); x = x ^ (x >> 16); return (int)x; } } static Int32 HashString(T input) { var s = input as String; int hash = 0; for (int i = 0; i < s.Length; i++) { hash += s[i]; hash += (hash << 10); hash ^= (hash >> 6); } hash += (hash << 3); hash ^= (hash >> 11); hash += (hash << 15); return hash; } }