Python: how to get the closest key to a given key?

Given a dictionary:

sample = { '123': 'Foo', '456': 'Bar', '789': 'Hello', '-111': 'World' } 

What is the most effective efficient way (approach and / or data structure) to get the closest (or less) key from the dictionary?

Note:
1. Even if the key is a string, the comparison must be numeric.
2. Keys may be "negative."

Example:

 get_nearest_less_element(sample, '456') # returns 'Bar' get_nearest_less_element(sample, '235') # returns 'Foo' get_nearest_less_element(sample, '455') # returns 'Foo' get_nearest_less_element(sample, '999') # returns 'Hello' get_nearest_less_element(sample, '0') # returns 'World' get_nearest_less_element(sample, '-110') # returns 'World' get_nearest_less_element(sample, '-999') # should return an error since it is beyond the lower bound 

Additional question :
Given the same dataset, is it better to use sorted OrderedDict or List of Tuples or any other python data structure?

+5
source share
6 answers
 def get_nearest_less_element(d, k): k = int(k) return d[str(max(key for key in map(int, d.keys()) if key <= k))] 

Change to update using @Paul Hankin code, but using <= , I'm not sure if he needs a branch at all. Convert all the keys into numbers, find those that are less than or equal to k, get the maximum value - if k is there, you will get it, otherwise you will get the next largest one - convert back to string and search in the dictionary.

Tests: https://repl.it/C2dN/0

I do not know if this is the most effective idea; since the dictionary you get is disordered, you need to iterate over each element, because any of them may be the next largest, and since you need a numerical comparison, you have to convert them all to integers. It seems to me that any other structure will require more initialization costs, since you will have to go through each element to transfer it to your structure.

But it depends on your use case - if k is likely to be in the dictionary, it makes sense to change my code to have the branch if k in d: return d[k] else: ... , because you don’t do generator expressions in this case would be faster. If this is very likely not in the dictionary, this will not help.

A pseudo-code (unverified) version that sorts their keys below - it will be slower to use once, but it may be faster to request a lot:

 # cache to store sorted keys between function calls # nb. you will have to invalidate this cache (reset to []) # when you get a new dictionary sorted_keys = [] def get_nearest_less_element(d, k): if k in d: # quick return if key is in dict return d[k] else: # costly sort of the keys, only do this once if not sorted_keys: sorted_keys = sorted(int(key) for key in d.keys()) # quick run through the sorted key list up # to latest item less than k k = int(k) nearest = sorted_keys[0] for item in sorted_keys: if item < k: nearest = item else: break return d[str(item)] 
+5
source

The module below returns a value if the key is present, in addition, it finds the maximum key in the list of keys that are smaller than the input key.

 def get_nearest_less_element(sample,key): if key in sample: return sample[key] else: return sample[str(max(x for x in sample.keys() if int(x) < int(key)))] print get_nearest_less_element(sample, '456') print get_nearest_less_element(sample, '235') print get_nearest_less_element(sample, '455') print get_nearest_less_element(sample, '999') 

Output:

Bar

Foo

Foo

Hello

Edit: Edited the answer based on Paul's comment.

+4
source

If you create or update a sample only once or infrequently, but repeatedly looking at the values, it would be most efficient to precompute the sorted number list in O (n log n) time. Then the entire dictionary does not need to be scanned; binary search gives O (log n) access. There is a python library module function, bisect, for this .

 from bisect import bisect def nearest_index(sorted_keys, elem): idx = bisect(sorted_keys, elem) if idx >= len(sorted_keys): idx = len(sorted_keys) - 1 elif idx > 0: # find closest of the two neighbors if elem <= (sorted_keys[idx-1] + sorted_keys[idx])/2.0: idx -= 1 return idx sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'} sorted_keys = sorted(int(k) for k in sample.keys()) def get_nearest_element(sample, sorted_keys, elem): elem_int = int(elem) idx_nearest = nearest_index(sorted_keys, elem_int) return sample[str(sorted_keys[idx_nearest])] for elem in ['456', '235', '455', '999']: print get_nearest_element(sample, sorted_keys, elem) 
+2
source

Given your dataset, the most efficient data structure in terms of installation and search time complexity is the binary search tree , which gives you O (n log n) and O (log n) search time complexity using O (n) complexity.

The standard BST algorithm does not include your two special restrictions (as I understand them)

  • returns the value for the maximum key <= specified key
  • linked the search space between the minimum and maximum values ​​on the map

Here is a BST implementation based on this implementation :

 class Node(object): def __init__(self, key, value, parent): self.left = None self.right = None self.value = value self.key = key self.parent = parent def __str__(self): return ":".join(map(str, (self.key, self.value))) class BinarySearchTree(object): def __init__(self): self.root = None def getRoot(self): return self.root def __setitem__(self, key, value): if(self.root == None): self.root = Node(key, value, None) else: self._set(key, value, self.root) def _set(self, key, value, node): if key == node.key: node.value = value elif key < node.key: if(node.left != None): self._set(key, value, node.left) else: node.left = Node(key, value, node) else: if(node.right != None): self._set(key, value, node.right) else: node.right = Node(key, value, node) def __contains__(self, key): return self._get(key) != None def __getitem__(self, key): if(self.root != None): return self._get(key, self.root) else: return None def _get(self, key, node): if key == node.key: return node.value elif key < node.key and node.left != None: return self._get(key, node.left) elif key > node.key and node.right != None: return self._get(key, node.right) 

Here is a subclass that fulfills requirement 1:

 class FuzzySearchTree(BinarySearchTree): def _get(self, key, node): if key == node.key: return node.value elif key < node.key: if node.left != None: return self._get(key, node.left) else: return self._checkMin(key, node) else: if node.right != None: return self._get(key, node.right) else: return node.value # found the closest match that is larger def _checkMin(self, key, node): return node.value 

To fulfill requirement 2, you need to keep track of the minimum value in the tree. You should probably do this by keeping track of the minimum value during insertion, but here is a different approach. This approach is not super-efficient, but it should still be o (3 log n) == O (log n), so that's not bad. If you really do not need this, I would not worry about it.

 class MinBoundedFuzzySearchTree(FuzzySearchTree): def _checkMin(self, key, node): # Unless the value is lower than the minimum value in the tree # Not advised next = node.parent while next.parent != None: next = next.parent # Go up the tree to the top next = next.left while next.left != None: next = next.left # Go down the tree to the left if next.key > key: return None # outside the the range of the tree # Return the max value less than the key, which is by definition the parent return node.parent.value 

Here are some pseudo tests:

 tree = BinarySearchTree() tree[123] = 'Foo' tree[456] = 'Bar' tree[789] = 'Hello' tree[-111] = 'World' print "BST(456) == 'Bar': " + str(tree[456]) print "BST(235) == None: " + str(tree[235]) print "BST(455) == None: " + str(tree[455]) print "BST(999) == None: " + str(tree[999]) print "BST(0) == None: " + str(tree[0]) print "BST(123) == 'Foo': " + str(tree[123]) print "BST(-110) == None: " + str(tree[-110]) print "BST(-999) == None: " + str(tree[-999]) tree = FuzzySearchTree() tree[123] = 'Foo' tree[456] = 'Bar' tree[789] = 'Hello' tree[-111] = 'World' print print "FST(456) == 'Bar': " + str(tree[456]) print "FST(235) == 'Foo': " + str(tree[235]) print "FST(455) == 'Foo': " + str(tree[455]) print "FST(999) == 'Hello': " + str(tree[999]) print "FST(0) == 'World': " + str(tree[0]) print "FST(123) == 'Foo': " + str(tree[123]) print "FST(-110) == 'World': " + str(tree[-110]) print "FST(-999) == 'World': " + str(tree[-999]) tree = MinBoundedFuzzySearchTree() tree[123] = 'Foo' tree[456] = 'Bar' tree[789] = 'Hello' tree[-111] = 'World' print print "MBFST(456) == 'Bar': " + str(tree[456]) print "MBFST(235) == 'Foo': " + str(tree[235]) print "MBFST(455) == 'Foo': " + str(tree[455]) print "MBFST(999) == 'Hello': " + str(tree[999]) print "MBFST(0) == 'World': " + str(tree[0]) print "MBFST(123) == 'Foo': " + str(tree[123]) print "MBFST(-110) == 'World': " + str(tree[-110]) print "MBFST(-999) == None: " + str(tree[-999]) 

And here is what it prints:

 """ BST(456) == 'Bar': Bar BST(235) == None: None BST(455) == None: None BST(999) == None: None BST(0) == None: None BST(123) == 'Foo': Foo BST(-110) == None: None BST(-999) == None: None FST(456) == 'Bar': Bar FST(235) == 'Foo': Foo FST(455) == 'Foo': Foo FST(999) == 'Hello': Hello FST(0) == 'World': World FST(123) == 'Foo': Foo FST(-110) == 'World': World FST(-999) == 'World': Foo MBFST(456) == 'Bar': Bar MBFST(235) == 'Foo': Foo MBFST(455) == 'Foo': Foo MBFST(999) == 'Hello': Hello MBFST(0) == 'World': World MBFST(123) == 'Foo': Foo MBFST(-110) == 'World': World MBFST(-999) == None: None """ 
+2
source

Here is the solution. Finds the closest key based on a numerical comparison:

 sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'} def get_nearest_less_element(inpDict, targetNum): diff = 2**32 - 1 # Very big number. currentKey = None for i in sample.keys(): newDiff = abs(int(i) - targetNum) if newDiff < diff: currentKey = i diff = newDiff return inpDict[currentKey] print(get_nearest_less_element(sample, 500)) # Prints Bar 

This is just one cycle through the dictionary, so it works in O (n) time and O (1) extra space.

+1
source

I did it as follows:

 def get_nearest_less_element(sample, key): try: if key not in sample: candidates = [] for keys in sample: if int(keys) < int(key): candidates.append(keys) return sample[max(candidates)] return sample[key] except ValueError: print("key is beyond lower bounds") 
+1
source

All Articles