I have two data files I'm working with. One contains a list of words with some additional information about these words, and the other contains pairs of words (where words are listed by their word identifiers from the first table) and their frequencies.
Lexicon file (output sample)
('wID', 'w1', 'w1cs', 'L1', 'c1')
('-----', '-----', '-----', '-----', '-----')
(1, ',', ',', ',', 'y')
(2, '.', '.', '.', 'y')
(3, 'the', 'the', 'the', 'at')
(4, 'and', 'and', 'and', 'cc')
(5, 'of', 'of', 'of', 'io')
Bigram file (sample output)
('freq', 'w1', 'w2')
(4, 22097, 161)
(1, 98664, 1320)
(1, 426515, 1345)
(1, 483675, 747)
(19, 63, 15496)
(2, 3011, 7944)
(1, 27985, 27778)
I created two tables using SQLite and loaded the data from the files above.
conn = sqlite3.connect('bigrams.db')
conn.text_factory = str
c = conn.cursor()
c.execute('pragma foreign_keys=ON')
Lexicon table
c.execute('''CREATE TABLE lex
(wID INT PRIMARY KEY, w1 TEXT, w1cs TEXT, L1 TEXT, c1 TEXT)''')
#I removed this index as per CL. suggestion
#c.execute('''DROP INDEX IF EXISTS lex_index''')
#c.execute('''CREATE INDEX lex_index ON lex (wID, w1, c1)''')
#and added this one
c.execute('''CREATE INDEX lex_w1_index ON lex (w1)''')
Insert data into vocabulary table
blocksize = 100000
with open('/Users/.../lexicon.txt', "rb") as lex_file:
data = []
line_counter = 0
for line in lex_file:
data.append(line.strip().split('\t'))
line_counter += 1
if line_counter % blocksize == 0:
try:
c.executemany ('insert or replace into lex values (?, ?, ?, ?, ?)', data)
conn.commit()
except IndexError:
block_start = line_counter - blocksize + 1
print 'Lex error lines {}-{}'.format(block_start, line_counter)
finally:
data = []
Bigram table
c.execute('''CREATE TABLE x2
(freq INT, w1 INT, w2 INT,
FOREIGN KEY(w1) REFERENCES lex(wID),
FOREIGN KEY(w2) REFERENCES lex(wID),
PRIMARY KEY(w1, w2) )''')
Insert data into bigram table
with open('/Users/.../x2.txt', "rb") as x2_file:
data = []
line_counter = 0
for line in x2_file:
data.append(line.strip().replace('\x00','').replace('\xff\xfe','').split('\t'))
line_counter += 1
if line_counter % blocksize == 0:
try:
c.executemany('insert or replace into x2 values (?, ?, ?)', data)
conn.commit()
except IndexError:
block_start = line_counter - blocksize + 1
print 'x2 error lines {}-{}'.format(block_start, line_counter)
finally:
data = []
conn.close()
I want to be able to check if a given pair of words exists in the data - for example, "like new"
When I specify only the first word, the program works fine.
cur.execute('''SELECT lex1.w1, lex2.w1 from x2
INNER JOIN lex as lex1 ON lex1.wID=x2.w1
INNER JOIN lex as lex2 ON lex2.wID=x2.w2
WHERE lex1.w1= "like" ’’’)
But when I want to find a few words, the code is very slow.
cur.execute('''SELECT lex1.w1, lex2.w1 from x2
INNER JOIN lex as lex1 ON lex1.wID=x2.w1
INNER JOIN lex as lex2 ON lex2.wID=x2.w2
WHERE lex1.w1="like" AND lex2.w1= "new" ''')
, .
.