This is additional code for my other answer. The core logic is in edb.py: import it from the Python console (for example `import edb as e`) and follow the `#Example:` comments, or use the web2py controller below and run it in your browser.
Save this as edb.py:
import itertools
import sys

import MySQLdb

connection = MySQLdb.connect (host = "localhost", user = "root", passwd = "x", db = "y")
cursor = connection.cursor()
query_counter = 0       # number of queries executed so far
print_queries = False   # set to True to echo every query before it runs
limit = 1000            # LIMIT appended to every query


def _run_query(query, params=None):
    """Append the global LIMIT, execute *query* and return the open cursor.

    *params*, when given, is handed to ``cursor.execute`` so the driver
    quotes the values; when omitted the query text is sent exactly as built
    by the caller (the original behaviour).
    """
    global query_counter
    query = query + ' LIMIT ' + str(limit)
    if print_queries:
        print(query)
        if params is not None:
            print(repr(params))
    cur = connection.cursor()
    cur.execute(query, params)
    query_counter += 1
    return cur


def fetch_one(query, params=None):
    """Run *query* and return the first column of the first row, or None.

    A falsy row (no result) yields None. *params* is optional and is passed
    to the driver for quoting.
    """
    cur = _run_query(query, params)
    try:
        row = cur.fetchone()
    finally:
        cur.close()
    return row[0] if row else None


def fetch_all(query, params=None):
    """Run *query* and return every fetched row (at most ``limit`` rows).

    *params* is optional and is passed to the driver for quoting.
    """
    cur = _run_query(query, params)
    try:
        return cur.fetchall()
    finally:
        cur.close()


def _flatten(list_of_lists):
    """Concatenate an iterable of iterables into one flat list."""
    return list(itertools.chain.from_iterable(list_of_lists))


#Example: e._search_by_name('steve martin')
def _search_by_name(name, operator='='):
    """Find mids whose name or alias matches *name*.

    *operator* is inserted verbatim into the SQL text, so it must come from
    trusted code. For any operator other than '=' a trailing '%' is appended
    to the search text.

    Returns ``(typed, ranked)``: ``typed`` maps each matching mid to the list
    returned by ``_types`` for it, and ``ranked`` lists the same mids ordered
    by how many types they have, most first. Empty or whitespace-only *name*
    (or None) returns ``({}, [])`` without touching the database.
    """
    typed, ranked = {}, []
    if name:
        name = name.strip()
    if not name:
        return (typed, ranked)
    filler = '' if operator == '=' else '%'
    #to filter meaningful stuff for every mid returned order by the number of types they have
    #search for value text if prop. is
    #select * from ns where value = 'the king' and (property = '/m/01gr' or property = '/m/06b');
    name_mid = _mid('/type/object/name')
    alias_mid = _mid('/common/topic/alias')
    # Only the operator is spliced into the text; the values are bound by the driver.
    query = ("select ns.source from ns where ns.value " + operator +
             " %s and ns.property in (%s, %s)")
    for row in fetch_all(query, (name + filler, name_mid, alias_mid)):
        typed[row[0]] = _types(row[0])
    # sorted() is stable, so mids with equal type counts keep their dict order.
    ranked = sorted(typed, key=lambda m: len(typed[m]), reverse=True)
    return (typed, ranked)


#Example: e._children('') <---will get the top level domains
#         e._children('/film') <---get all types from the domain
#         e._children('/film/film') <---get all properties for the type
def _children(parent, expand=False, raw=False):
    """List the entries of ``types`` whose destination is *parent*.

    With ``raw=True`` the ``source`` column is returned. Otherwise each
    entry is ``'/' + value``, prefixed with *parent* when ``expand`` is true.
    """
    res = fetch_all(
        "select t.source, t.value from types t where t.destination = %s",
        (parent,))
    if raw:
        return [row[0] for row in res]
    prefix = parent if expand else ''
    # Reuse the rows already fetched instead of running the query again.
    return [prefix + '/' + row[1] for row in res]


#Example: e._parent('/film/film/songs')
def _parent(child):
    """Drop the last path component: '/people/marriage/to' -> '/people/marriage'."""
    #if not isinstance( child, str ): return None # what kind of safety mechanisms do we need here?
    return '/'.join(child.split('/')[:-1])


#Example: e._domains()
def _domains():
    """Return ``_children('')``."""
    return _children('')


#Example: e._top_level_types()
def _top_level_types():
    """Return ``_children('/type')``."""
    return _children('/type')


#TODO get all primitive types

#Example: e._mid('/type/object')
#         e._mid('/authority/imdb/name/nm0000188')
def _mid(key):
    """Resolve a slash-separated *key* to the matching ``types.source`` value.

    Empty or None keys give None. '/' is treated as '/boot/root_namespace'.
    A key whose first path component is 'm' is taken to be a mid already and
    is returned unchanged. Anything else is split into namespace (everything
    before the last '/') and final component and looked up in ``types``;
    the result is None when no row matches.
    """
    if not key:
        return None
    if key == '/':
        key = '/boot/root_namespace'
    parts = key.split('/')
    # len() guard: a key without any '/' has a single part and is looked up
    # with an empty namespace instead of raising IndexError.
    if len(parts) > 1 and parts[1] == 'm':  #already a mid
        return key
    namespace = '/'.join(parts[:-1])
    return fetch_one(
        "select source from types t where t.destination = %s and t.value = %s",
        (namespace, parts[-1]))


#Example: e._key('/type')
def _key(mid):
    """Return one key for *mid*, preferring keys that start with '/type'.

    For a str, returns the first '/type...' key from ``_keys`` if any, else
    the first key, else None. For a list or tuple, returns the keys of the
    elements with None results dropped. Any other argument type gives None.
    """
    if isinstance(mid, str):
        res = _keys(mid)
        if not res:
            return None
        rt = [r for r in res if r.startswith('/type')]
        return rt[0] if rt else res[0]
    elif isinstance(mid, (list, tuple)):
        res = [_key(e) for e in mid]
        return [r for r in res if r is not None]
    else:
        return None


def _keys(mid):
    """Return every ``destination + '/' + value`` from ``types`` rows whose source is *mid*."""
    # check for '/type/object/key' as well?
    rows = fetch_all(
        "select t.destination, t.value from types t where t.source = %s",
        (mid,))
    return [row[0] + '/' + row[1] for row in rows]


#Example: e._types('/m/0p_47')
def _types(mid):
    """Return the destinations linked from *mid* via the '/type/object/type' property."""
    tm = _mid('/type/object/type')
    rows = fetch_all(
        "select l.destination from links l where l.source = %s and l.property = %s",
        (mid, tm))
    return [row[0] for row in rows]


#Example: e._props_n('/m/0p_47') <---Named immediate properties (like name, etc.)
def _props_n(mid):
    """Return the distinct properties stored for *mid* in ``ns`` (order not defined)."""
    #the same property can be set more than once per topic!
    rows = fetch_all("select ns.property from ns where ns.source = %s", (mid,))
    return list(set(row[0] for row in rows))


#Example: e._props_l('/m/0p_47') <---All remote properties, some are named, some are anonymous
def _props_l(mid):
    """Map each link property of *mid* to the list of its destinations.

    Links through '/type/object/type' are excluded.
    """
    #the same property can be set more than once per topic!
    tm = _mid('/type/object/type')  #exclude types, they have tons of instance links
    rows = fetch_all(
        "select l.property, l.destination from links l where l.source = %s and property <> %s",
        (mid, tm))
    output = {}
    for prop, dest in rows:
        output.setdefault(prop, []).append(dest)
    return output


#Example: e._props_ln('/m/0p_47') <---All remote named properties
def _props_ln(mid):
    """Return the link properties of *mid* whose first destination has type '/common/topic'."""
    #named properties
    result = []
    ps = _props_l(mid)
    common_topic = _mid('/common/topic')
    for p in ps:
        ts = _types(ps[p][0])
        if common_topic in ts:  #it a common topic
            result.append(p)
    return result


#Example: e._props_la('/m/0p_47') <---All remote anonymous properties, these actually belong to the children!
#instead of has type /common/topic we used to check if it has name
def _props_la(mid, raw=True):
    """Return the children of the types of *mid*'s non-topic link destinations.

    For every link property whose first destination is not a '/common/topic',
    the keys of that destination's types are looked up and the children of
    the first key are collected (``raw`` is forwarded to ``_children``).
    The per-property lists are flattened into one list.
    """
    #anonymous properties (blank nodes in RDF?)
    result = []
    ps = _props_l(mid)
    common_topic = _mid('/common/topic')
    for p in ps:
        ts = _types(ps[p][0])
        if common_topic not in ts:  #it is not a common topic
            t = _key(ts)  # reuse ts rather than querying the types a second time
            if t and '/type/type' not in t:  #FIXME: hack not to go into types, could be done better
                result.append(_children(t[0], expand=True, raw=raw))  #get the first, is this correct?
    return _flatten(result)  #it is a list of lists


#FIXME: try to get '/film/actor/film' -> '/type/property/expected_type' -> '/film/performance' -> properties/children
#instead of trying is something has name

#Example: e._get_n('/m/0p_47', e._props_n('/m/0p_47')[0])['/lang/en'] <---These come with a namespace
def _get_n(mid, prop):
    """Return every ``ns.value`` stored for *mid* under property *prop*.

    *prop* may be a key or a mid; it is resolved with ``_mid`` first.
    """
    #the same property can be set more than once per topic!
    p = _mid(prop)
    rows = fetch_all(
        "select ns.value from ns where ns.source = %s and ns.property = %s",
        (mid, p))
    return [r[0] for r in rows]


#Example: e._get_l('/m/0p_47', e._props_l('/m/0p_47')[0]) <---returns a list of mids corresponding to that prop.
#         e._name(e._get_l('/m/0p_47', '/film/writer/film'))
def _get_l( mid, prop ):
    """Return the destinations linked from *mid* through property *prop*.

    *prop* may be a key or a mid; it is resolved with ``_mid`` first.
    """
    #the same property can be set more than once per topic!
    p = _mid( prop )
    # NOTE(review): mid and p are interpolated into the SQL text unescaped.
    query = "select l.destination from links l where l.source = '%s' and l.property = '%s'" % (mid, p)
    return [ row[0] for row in fetch_all( query ) ]


#Example: e._name(e._get_ln('/m/0p_47', e._props_ln('/m/0p_47')[0]))
def _get_ln( mid, p ):
    """Alias for ``_get_l``, kept for naming consistency."""
    return _get_l( mid, p )


#Example: e._name(e._get_la('/m/0p_47', '/film/performance/film'))
def _get_la( mid, prop ):
    """Follow *prop* on the intermediate nodes linked from *mid*.

    For every link property of *mid*, the destinations are fetched; when the
    first destination has, among its types, the mid of the parent path of
    *prop*'s key, ``_get_l(e, prop)`` is collected for each destination ``e``.
    The collected lists are flattened into a single list.
    """
    result = []
    ps = _props_l( mid )
    # The type that owns *prop* depends only on *prop*, so it is resolved at
    # most once, and only when a property with destinations is found (the
    # original recomputed it, with several queries, on every iteration).
    owner_type = None
    owner_resolved = False
    for p in ps:
        es = _get_l( mid, p ) #get the destinations
        if not es:
            continue
        ts = set( _types( es[0] ) )
        if not owner_resolved:
            owner_type = _mid( _parent( _key( _mid( prop ) ) ) )
            owner_resolved = True
        if owner_type in ts:
            for e in es:
                result.append( _get_l( e, prop ) )
    return _flatten( result ) #return after the first result


#How do we determine properties with multiple values vs those with singular (ie place of birth)?
#is this in the ontology?
#Ans: yes, /type/property/unique

#Example: e._all_names_ln('/m/0p_47') <---gets all of object remote named properties
def _all_names_ln( mid ):
    """Map the key of each property from ``_props_ln(mid)`` to the names of its destinations."""
    result = {}
    for p in _props_ln( mid ):
        result[ _key( p ) ] = _name( _get_ln( mid, p ) )
    return result


#Example: e._all_names_la('/m/0p_47') <---gets all of object remote anonymous properties
def _all_names_la( mid ):
    """Map the key of each property from ``_props_la(mid)`` to the names reached through it."""
    #TODO: prevent loops, run e.all_names_la('/m/0p_47')
    result = {}
    for p in _props_la( mid ):
        result[ _key( p ) ] = _name( _get_la( mid, p ) )
    return result


#FIXME: _all_names_la is going into destinations which are types and have a ton of instance links...
#Example: e._name('/m/0p_47') <---the name of a topic
#
def _name( mid ):
    """Return the '/type/object/name' values for *mid*.

    A str yields the list from ``_get_n``. A list, tuple or set yields a
    list with the result for each element. Any other type yields None.
    """
    if isinstance( mid, str ):
        nm = _mid( '/type/object/name' )
        return _get_n( mid, nm )
    elif isinstance( mid, ( list, tuple, set ) ):
        return [ _name( e ) for e in mid ]
    else:
        return None


#for internal use only
def _get_linked( mid ):
    """Return the set of destinations linked from *mid*, excluding type links."""
    tm = _mid( '/type/object/type' ) #exclude types, they have tons of instance links
    query = "select destination from links where source = '%s' and property <> '%s' " % ( mid, tm )
    return set( [ r[0] for r in fetch_all( query ) ] )


#for internal use only
def _get_connections_internal( entity1, target, path, all_paths, depth, max_depth ):
    """Depth-first search from *entity1* towards *target*.

    *path* is the list of nodes visited so far and is extended in place.
    Every path that reaches *target* is appended to *all_paths*. Nothing is
    returned. The search stops once *depth* exceeds *max_depth*.
    """
    if depth > max_depth:
        return
    # Debug trace, always on in the original; written as single-argument
    # print calls so it behaves the same under Python 2 and 3.
    print( '' )
    print( str( entity1 ) + ', ' + str( target ) )
    print( str( path ) )
    print( str( all_paths ) )
    print( depth )
    path.append( entity1 )
    if entity1 == target:
        # Already there: record the path without adding the target a second
        # time and without querying the database.
        all_paths.append( path )
        return
    linked1 = _get_linked( entity1 )
    if target in linked1:
        path.append( target )
        all_paths.append( path )
        return
    for l1 in linked1:
        if l1 in path:
            continue
        # each branch gets its own copy of the path
        _get_connections_internal( l1, target, list( path ), all_paths, depth+1, max_depth )


#Example: e._name(e._get_connections('/m/0p_47', '/m/0cwtm')) <---find path in the graph between the two entities
def _get_connections( entity1, target ):
    """Return the list of paths (lists of nodes) from *entity1* to *target*, searched with max depth 2."""
    result = []
    _get_connections_internal( entity1, target, [], result, 0, 2 )
    return result


#for internal use only
def _get_connections_internal2( entity1, entity2, path1, path2, all_paths, depth, max_depth, level ):
    """Search from both ends at once, recording pairs of meeting paths.

    *path1* / *path2* are the nodes walked from each side and are extended in
    place. Matches are appended to *all_paths* as ``(path1, path2)`` tuples
    and are only accepted once *level* has counted down to 0 (negative
    levels are clamped to 0). Nothing is returned. The search stops once
    *depth* exceeds *max_depth*.
    """
    if depth > max_depth:
        return
    if level < 0:
        level = 0
    path1.append( entity1 )
    path2.append( entity2 )
    if entity1 == entity2 and level == 0:
        all_paths.append( ( path1, path2 ) ) #no need to append entity1 or entity2 to the paths
        return
    linked1 = _get_linked( entity1 )
    if entity2 in linked1 and entity2 not in path1 and level == 0:
        path1.append( entity2 )
        all_paths.append( ( path1, path2 ) )
        return
    linked2 = _get_linked( entity2 )
    if entity1 in linked2 and entity1 not in path2 and level == 0:
        path2.append( entity1 )
        all_paths.append( ( path1, path2 ) )
        return
    # common neighbours not yet on either path
    inters = linked1.intersection( linked2 )
    inters = inters.difference( set( path1 ) )
    inters = inters.difference( set( path2 ) )
    if inters and level == 0:
        for e in inters: #these are many paths, have to clone
            p1 = list( path1 )
            p1.append( e )
            p2 = list( path2 )
            p2.append( e )
            all_paths.append( ( p1, p2 ) )
        return
    for l1 in linked1:
        if l1 in path1 or l1 in path2:
            continue
        for l2 in linked2:
            if l2 in path1 or l2 in path2:
                continue
            _get_connections_internal2( l1, l2, list( path1 ), list( path2 ), all_paths, depth+1, max_depth, level - 1 )


#Example: e._name(e._get_connections2('/m/0p_47', '/m/0cwtm')) <---returns two meeting paths starting from both entities
#         e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=1)) <---search deeper
#         e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=2)) <---even deeper
def _get_connections2( entity1, entity2, level = 0 ):
    """Return the list of ``(path1, path2)`` pairs found between the two entities, searched with max depth 15."""
    result = []
    _get_connections_internal2( entity1, entity2, [], [], result, 0, 15, level )
    return result
And here is a sample web2py controller (just copy edb.py to the web2py model directory):
# -*- coding: utf-8 -*- def mid_to_url( mid ): return mid.split('/')[2] def index(): form = FORM( TABLE( TR( INPUT(_name='term', _value=request.vars.term ) ), TR(INPUT(_type='submit', _value='Search') ) ), _method='get') typed, ranked = _search_by_name( request.vars.term ) rows = [] for r in ranked: keys = [] for t in typed[r]: k = _key( t ) if k: keys.append( k ) rows.append( TR( TD( A(_name( r ), _href = URL('result', args = [mid_to_url(r)]))), TD( XML( '<br/>'.join( keys ) ) ) ) ) result = TABLE( *rows ) return { 'form': form, 'result' : result } def result(): path, data = '', '' if not request.args: return { 'path':path, 'data':data} path_rows = [] for ra in range(len(request.args)): if ra%2: arrow_url = URL( 'static', 'images/blue_arr.png' ) display_name = _key('/m/'+request.args[ra])