LogLog and HyperLogLog algorithms for high power counting

Where can I find the correct implementation of the LogLog algorithm ? They tried to implement it on their own, but my implementation project gives strange results.

Here it is:

function LogLog(max_error, max_count) { function log2(x) { return Math.log(x) / Math.LN2; } var m = 1.30 / max_error; var k = Math.ceil(log2(m * m)); m = Math.pow(2, k); var k_comp = 32 - k; var l = log2(log2(max_count / m)); if (isNaN(l)) l = 1; else l = Math.ceil(l); var l_mask = ((1 << l) - 1) >>> 0; var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; var rank = 0; for (var i = 0; i < k_comp; ++i) { if ((hash >>> i) & 1) { rank = i + 1; break; } } M[j] = Math.max(M[j], rank & l_mask); } else { var c = 0; for (var i = 0; i < m; ++i) c += M[i]; return 0.79402 * m * Math.pow(2, c / m); } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ... ,'zoology']; // about 2 300 words var log_log = LogLog(0.01, 100000); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i])); alert(log_log.count()); 

For some unknown reason, the implementation is very sensitive to the max_error parameter; it is the main factor that determines the value of the result. I'm sure there is some kind of stupid mistake :)

UPDATE: This problem is being solved in a newer version of the algorithm. I will publish it later.

+58
javascript algorithm counting hyperloglog
May 13 '11 at 10:40
source share
6 answers

Here is the updated version of the algorithm based on newer paper :

 var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } var words = ['aardvark', 'abyssinian', ..., 'zoology']; // 2336 words var seed = Math.floor(Math.random() * pow_2_32); // make more fun var log_log = HyperLogLog(0.065); for (var i = 0; i < words.length; ++i) log_log.count(fnv1a(words[i]) ^ seed); var count = log_log.count(); alert(count + ', error ' + (count - words.length) / (words.length / 100.0) + '%'); 
+18
May 24 '11 at 7:44
source share

Here is a slightly modified version that adds a merge operation.

Merging allows you to accept counters from multiple HyperLogLog instances, and define unique counters in general.

For example, if you have unique visitors collected on Monday, Tuesday, and Wednesday, then you can combine the buckets together and count the number of unique visitors for three days:

 var pow_2_32 = 0xFFFFFFFF + 1; function HyperLogLog(std_error) { function log2(x) { return Math.log(x) / Math.LN2; } function rank(hash, max) { var r = 1; while ((hash & 1) == 0 && r <= max) { ++r; hash >>>= 1; } return r; } var m = 1.04 / std_error; var k = Math.ceil(log2(m * m)), k_comp = 32 - k; m = Math.pow(2, k); var alpha_m = m == 16 ? 0.673 : m == 32 ? 0.697 : m == 64 ? 0.709 : 0.7213 / (1 + 1.079 / m); var M = []; for (var i = 0; i < m; ++i) M[i] = 0; function merge(other) { for (var i = 0; i < m; i++) M[i] = Math.max(M[i], other.buckets[i]); } function count(hash) { if (hash !== undefined) { var j = hash >>> k_comp; M[j] = Math.max(M[j], rank(hash, k_comp)); } else { var c = 0.0; for (var i = 0; i < m; ++i) c += 1 / Math.pow(2, M[i]); var E = alpha_m * m * m / c; // -- make corrections if (E <= 5/2 * m) { var V = 0; for (var i = 0; i < m; ++i) if (M[i] == 0) ++V; if (V > 0) E = m * Math.log(m / V); } else if (E > 1/30 * pow_2_32) E = -pow_2_32 * Math.log(1 - E / pow_2_32); // -- return E; } } return {count: count, merge: merge, buckets: M}; } function fnv1a(text) { var hash = 2166136261; for (var i = 0; i < text.length; ++i) { hash ^= text.charCodeAt(i); hash += (hash << 1) + (hash << 4) + (hash << 7) + (hash << 8) + (hash << 24); } return hash >>> 0; } 

Then you can do something like this:

 // initialize one counter per day var ll_monday = HyperLogLog(0.01); var ll_tuesday = HyperLogLog(0.01); var ll_wednesday = HyperLogLog(0.01); // add 5000 unique values in each day for(var i=0; i<5000; i++) ll_monday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_tuesday.count(fnv1a('' + Math.random())); for(var i=0; i<5000; i++) ll_wednesday.count(fnv1a('' + Math.random())); // add 5000 values which appear every day for(var i=0; i<5000; i++) {ll_monday.count(fnv1a(''+i)); ll_tuesday.count(fnv1a('' + i)); ll_wednesday.count(fnv1a('' + i));} // merge three days together together = HyperLogLog(0.01); together.merge(ll_monday); together.merge(ll_tuesday); together.merge(ll_wednesday); // report console.log('unique per day: ' + Math.round(ll_monday.count()) + ' ' + Math.round(ll_tuesday.count()) + ' ' + Math.round(ll_wednesday.count())); console.log('unique numbers overall: ' + Math.round(together.count())); 
+4
Mar 31 '14 at 12:18
source share

We opened a project called Stream-Lib, which has a LogLog implementation . The work was based on this article .

+2
Dec 04 '11 at 12:53 on
source share

Using the jact-provided version of @actual, I tried to implement the same thing in C #, which seems pretty close. Just changed the fnv1a function and renamed it to getHashCode. (Credit goes to the Jenkins hash function, http://en.wikipedia.org/wiki/Jenkins_hash_function )

 public class HyperLogLog { private double mapSize, alpha_m, k; private int kComplement; private Dictionary<int, int> Lookup = new Dictionary<int, int>(); private const double pow_2_32 = 4294967297; public HyperLogLog(double stdError) { mapSize = (double)1.04 / stdError; k = (long)Math.Ceiling(log2(mapSize * mapSize)); kComplement = 32 - (int)k; mapSize = (long)Math.Pow(2, k); alpha_m = mapSize == 16 ? (double)0.673 : mapSize == 32 ? (double)0.697 : mapSize == 64 ? (double)0.709 : (double)0.7213 / (double)(1 + 1.079 / mapSize); for (int i = 0; i < mapSize; i++) Lookup[i] = 0; } private static double log2(double x) { return Math.Log(x) / 0.69314718055994530941723212145818;//Ln2 } private static int getRank(uint hash, int max) { int r = 1; uint one = 1; while ((hash & one) == 0 && r <= max) { ++r; hash >>= 1; } return r; } public static uint getHashCode(string text) { uint hash = 0; for (int i = 0, l = text.Length; i < l; i++) { hash += (uint)text[i]; hash += hash << 10; hash ^= hash >> 6; } hash += hash << 3; hash ^= hash >> 6; hash += hash << 16; return hash; } public int Count() { double c = 0, E; for (var i = 0; i < mapSize; i++) c += 1d / Math.Pow(2, (double)Lookup[i]); E = alpha_m * mapSize * mapSize / c; // Make corrections & smoothen things. if (E <= (5 / 2) * mapSize) { double V = 0; for (var i = 0; i < mapSize; i++) if (Lookup[i] == 0) V++; if (V > 0) E = mapSize * Math.Log(mapSize / V); } else if (E > (1 / 30) * pow_2_32) E = -pow_2_32 * Math.Log(1 - E / pow_2_32); // Made corrections & smoothen things, or not. return (int)E; } public void Add(object val) { uint hashCode = getHashCode(val.ToString()); int j = (int)(hashCode >> kComplement); Lookup[j] = Math.Max(Lookup[j], getRank(hashCode, kComplement)); } } 
+2
Jun 12 '12 at 11:05
source share

I know this is an old post, but the @buryat implementation has moved and is incomplete anyway, and a bit on the slow side (sorry o_o).

I accepted the implementation used by the new version of Redis, which can be found here and ported it to PHP. The repo is here https://github.com/joegreen0991/HyperLogLog

 <?php class HyperLogLog { private $HLL_P_MASK; private $HLL_REGISTERS; private $ALPHA; private $registers; public function __construct($HLL_P = 14) { $this->HLL_REGISTERS = (1 << $HLL_P); /* With P=14, 16384 registers. */ $this->HLL_P_MASK = ($this->HLL_REGISTERS - 1); /* Mask to index register. */ $this->ALPHA = 0.7213 / (1 + 1.079 / $this->HLL_REGISTERS); $this->registers = new SplFixedArray($this->HLL_REGISTERS); for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = 0; } } public function add($v) { $h = crc32(md5($v)); $h |= 1 << 63; /* Make sure the loop terminates. */ $bit = $this->HLL_REGISTERS; /* First bit not used to address the register. */ $count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */ while(($h & $bit) == 0) { $count++; $bit <<= 1; } /* Update the register if this element produced a longer run of zeroes. */ $index = $h & $this->HLL_P_MASK; /* Index a register inside registers. */ if ($this->registers[$index] < $count) { $this->registers[$index] = $count; } } public function export() { $str = ''; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $str .= chr($this->registers[$i]); } return $str; } public function import($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { $this->registers[$i] = isset($str[$i]) ? ord($str[$i]) : 0; } } public function merge($str) { for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if(isset($str[$i])) { $ord = ord($str[$i]); if ($this->registers[$i] < $ord) { $this->registers[$i] = $ord; } } } } /** * @static * @param $arr * @return int Number of unique items in $arr */ public function count() { $E = 0; $ez = 0; for ($i = 0; $i < $this->HLL_REGISTERS; $i++) { if ($this->registers[$i] !== 0) { $E += (1.0 / pow(2, $this->registers[$i])); } else { $ez++; $E += 1.0; } } $E = (1 / $E) * $this->ALPHA * $this->HLL_REGISTERS * $this->HLL_REGISTERS; /* Use the LINEARCOUNTING algorithm for small cardinalities. * For larger values but up to 72000 HyperLogLog raw approximation is * used since linear counting error starts to increase. However HyperLogLog * shows a strong bias in the range 2.5*16384 - 72000, so we try to * compensate for it. */ if ($E < $this->HLL_REGISTERS * 2.5 && $ez != 0) { $E = $this->HLL_REGISTERS * log($this->HLL_REGISTERS / $ez); } else if ($this->HLL_REGISTERS == 16384 && $E < 72000) { // We did polynomial regression of the bias for this range, this // way we can compute the bias for a given cardinality and correct // according to it. Only apply the correction for P=14 that what // we use and the value the correction was verified with. $bias = 5.9119 * 1.0e-18 * ($E*$E*$E*$E) -1.4253 * 1.0e-12 * ($E*$E*$E)+ 1.2940 * 1.0e-7 * ($E*$E) -5.2921 * 1.0e-3 * $E+ 83.3216; $E -= $E * ($bias/100); } return floor($E); } } 
+2
Apr 7 '14 at 8:15
source share

I implemented loglog and hyperloglog in JS and PHP and commented well on the code https://github.com/buryat/loglog

+1
May 8 '12 at 20:05
source share



All Articles