Following @kohlehydrat's suggestion:
import contextlib

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: same urlopen() API under a different module name
    import urllib.request as urllib2


class TldMatcher(object):
    """Find the public suffix ("TLD") and registrable label of a host name.

    Rules come from a Public Suffix List style file: one rule per line,
    ``//`` comment lines, ``*.`` wildcard rules and ``!`` exception rules.
    The rule set is stored on the class, so it is downloaded at most once
    and shared by every instance.
    """

    # Class attributes give lazy, shared loading: nothing is fetched until
    # the first instance is created.
    # The list used to be served from mxr.mozilla.org as
    # effective_tld_names.dat; that service has been retired.
    MASTERURL = "https://publicsuffix.org/list/public_suffix_list.dat"
    TLDS = None

    @staticmethod
    def _parseRules(lines):
        """Return the set of rules found in *lines*.

        *lines* is an iterable of text or UTF-8 encoded byte strings.
        Blank lines and ``//`` comment lines are skipped, and each rule is
        read only up to the first whitespace.
        """
        rules = set()
        for raw in lines:
            if isinstance(raw, bytes):
                # urlopen() yields bytes; decode so rules compare equal to
                # ordinary text host names.
                raw = raw.decode('utf-8')
            fields = raw.split()
            if not fields or fields[0].startswith('//'):
                continue
            rules.add(fields[0])
        return rules

    @staticmethod
    def _splitHost(url):
        """Return the lower-cased labels of *url*, or ``[]`` if it is empty.

        Surrounding whitespace and a trailing root dot are ignored.
        """
        if not url:
            return []
        host = url.strip().rstrip('.').lower()
        return host.split('.') if host else []

    @classmethod
    def loadTlds(cls, url=None):
        """Download the rule list from *url* (default ``MASTERURL``).

        The parsed rules replace ``cls.TLDS``. Errors raised by
        ``urlopen`` propagate to the caller.
        """
        url = url or cls.MASTERURL
        # closing() releases the connection even if reading fails, and also
        # works on Python 2 where the response is not a context manager.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            lines = response.readlines()
        cls.TLDS = cls._parseRules(lines)

    def __init__(self):
        """Load the shared rule list on first use."""
        if TldMatcher.TLDS is None:
            TldMatcher.loadTlds()

    def getTld(self, url):
        """Return the longest public suffix of host name *url*, or ``None``.

        Matching is case-insensitive and the result is lower-cased.
        ``None`` is returned when no rule matches.
        """
        chunks = self._splitHost(url)
        rules = TldMatcher.TLDS
        best_match = None
        # Walk from the right-most label towards the full name so that the
        # last match recorded is the longest one.
        for start in range(len(chunks) - 1, -1, -1):
            test = '.'.join(chunks[start:])
            if '!' + test in rules:
                # An exception rule overrides any wildcard: the suffix is
                # the rule with its left-most label removed.
                best_match = '.'.join(chunks[start + 1:]) or None
                break
            startest = '.'.join(['*'] + chunks[start + 1:])
            if test in rules or startest in rules:
                best_match = test
        return best_match

    def get2ld(self, url):
        """Return the label directly left of the public suffix of *url*.

        Returns ``None`` when no suffix is known for *url* or when *url*
        has no label in front of its suffix.
        """
        tld = self.getTld(url)
        if tld is None:
            return None
        labels = self._splitHost(url)
        depth = len(tld.split('.'))
        if len(labels) <= depth:
            return None
        return labels[-1 - depth]


def test_TldMatcher():
    """Self-check: every sample host must yield the label ``'site'``.

    Prints one line per failure, or ``Passed!``, and returns ``True`` when
    all samples pass. Creating the matcher downloads the rule list unless
    ``TldMatcher.TLDS`` has already been populated.
    """
    matcher = TldMatcher()

    test_urls = [
        'site.co.uk',
        'site.com',
        'site.me.uk',
        'site.jpn.com',
        'site.org.uk',
        'site.it',
    ]

    errors = 0
    for u in test_urls:
        res = matcher.get2ld(u)
        if res != 'site':
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Error: found '{0}', should be 'site'".format(res))
            errors += 1

    if errors == 0:
        print("Passed!")
    return errors == 0


# This is a manual check that may hit the network and returns a bool; keep
# test runners that collect by name (e.g. pytest) from running it.
test_TldMatcher.__test__ = False
Hugh Bothwell
source share