SQLite join expression problem

I have two data files I'm working with. One contains a list of words with some additional information about these words, and the other contains pairs of words (where words are listed by their word identifiers from the first table) and their frequencies.

Lexicon file (output sample)

('wID', 'w1', 'w1cs', 'L1', 'c1')
('-----', '-----', '-----', '-----', '-----')
(1, ',', ',', ',', 'y')
(2, '.', '.', '.', 'y')
(3, 'the', 'the', 'the', 'at')
(4, 'and', 'and', 'and', 'cc')
(5, 'of', 'of', 'of', 'io')

Bigram file (sample output)

('freq', 'w1', 'w2')
(4, 22097, 161)
(1, 98664, 1320)
(1, 426515, 1345)
(1, 483675, 747)
(19, 63, 15496)
(2, 3011, 7944)
(1, 27985, 27778)

I created two tables using SQLite and loaded the data from the files above.

# Open the SQLite database file that holds the lexicon and bigram tables.
conn = sqlite3.connect('bigrams.db')
# Use str as the factory for TEXT values returned by queries.
conn.text_factory = str
c = conn.cursor()
# Switch on the foreign_keys pragma for this connection so that the
# FOREIGN KEY clauses declared on x2 are enforced.
c.execute('pragma foreign_keys=ON')

Lexicon table

# Lexicon table: one row per word, keyed by the integer word identifier wID.
c.execute('''CREATE TABLE lex
            (wID INT PRIMARY KEY, w1 TEXT, w1cs TEXT, L1 TEXT, c1 TEXT)''')

# The three-column index below was removed following CL.'s suggestion.
#c.execute('''DROP INDEX IF EXISTS lex_index''') 
#c.execute('''CREATE INDEX lex_index ON lex (wID, w1, c1)''')

# It was replaced by a single-column index on the word text w1.
c.execute('''CREATE INDEX lex_w1_index ON lex (w1)''')

Insert data into the lexicon table

#I replaced this code
# with open('/Users/.../lexicon.txt', "rb") as lex_file:
#    for line in lex_file:
#        currentRow = line.split('\t')
#        try:
#            data = [currentRow[0], currentRow[1], currentRow[2], currentRow[3], str(currentRow[4].strip('\r\n'))]
#           c.executemany ('insert or replace into lex values (?, ?, ?, ?, ?)', (data,))
#        except IndexError:
#            pass   


#with the one that Julian wrote

# Number of input lines buffered before each executemany()/commit().
blocksize = 100000


def _insert_lex_block(rows, last_line):
    """Insert one buffered block of lexicon rows and commit it.

    rows is a list of field lists, one per input line; last_line is the
    1-based number of the last input line in the block and is used only
    for the error report.  A block that cannot be bound is rolled back and
    reported instead of aborting the whole load.
    """
    try:
        c.executemany('insert or replace into lex values (?, ?, ?, ?, ?)', rows)
        conn.commit()
    except (IndexError, sqlite3.ProgrammingError):
        # sqlite3 signals a row with the wrong number of fields with
        # ProgrammingError; IndexError is kept for backward compatibility.
        block_start = last_line - len(rows) + 1
        print('Lex error lines {}-{}'.format(block_start, last_line))
        # Discard whatever part of the block was inserted before the failure.
        conn.rollback()


with open('/Users/.../lexicon.txt', "rb") as lex_file:
    data = []
    line_counter = 0
    for line in lex_file:
        # One tab-separated record per line: wID, w1, w1cs, L1, c1.
        data.append(line.strip().split('\t'))
        line_counter += 1
        if line_counter % blocksize == 0:
            _insert_lex_block(data, line_counter)
            data = []
    # Flush the final partial block; without this, every line after the last
    # multiple of blocksize (or the whole file, if it is shorter than
    # blocksize) would never reach the database.
    if data:
        _insert_lex_block(data, line_counter)
        data = []

Bigram table

#I replaced this code to create table x2 
#c.execute('''CREATE TABLE x2
#             (freq INT, w1 INT, w2 INT, FOREIGN KEY(w1) REFERENCES lex(wID), FOREIGN KEY(w2) REFERENCES lex(wID))''')

#with the code that Julian suggested
# Bigram table: a frequency plus two word identifiers, each referencing
# lex(wID).  The composite primary key makes every (w1, w2) pair unique.
c.execute('''CREATE TABLE x2
             (freq INT, w1 INT, w2 INT,
              FOREIGN KEY(w1) REFERENCES lex(wID),
              FOREIGN KEY(w2) REFERENCES lex(wID),
              PRIMARY KEY(w1, w2) )''')

Insert data into bigram table

#Replaced this code
#with open('/Users/.../x2.txt', "rb") as x2_file:
#    for line in x2_file:
#        currentRow = line.split('\t')
#        try:
#            data = [str(currentRow[0].replace('\x00','').replace('\xff\xfe','')), str(currentRow[1].replace('\x00','')), str(currentRow[2].replace('\x00','').strip('\r\n'))]
#           c.executemany('insert or replace into x2 values (?, ?, ?)', (data,))
#        except IndexError:
#            pass

#with this one suggested by Julian 
def _insert_x2_block(rows, last_line):
    """Insert one buffered block of bigram rows and commit it.

    rows is a list of field lists, one per input line; last_line is the
    1-based number of the last input line in the block and is used only
    for the error report.  A block that cannot be bound is rolled back and
    reported instead of aborting the whole load.
    """
    try:
        c.executemany('insert or replace into x2 values (?, ?, ?)', rows)
        conn.commit()
    except (IndexError, sqlite3.ProgrammingError):
        # sqlite3 signals a row with the wrong number of fields with
        # ProgrammingError; IndexError is kept for backward compatibility.
        block_start = last_line - len(rows) + 1
        print('x2 error lines {}-{}'.format(block_start, last_line))
        # Discard whatever part of the block was inserted before the failure.
        conn.rollback()


with open('/Users/.../x2.txt', "rb") as x2_file:
    data = []
    line_counter = 0
    for line in x2_file:
        # Remove the NUL bytes and the '\xff\xfe' marker BEFORE stripping
        # whitespace: a NUL next to the line ending would otherwise shield a
        # trailing '\r' from strip() and leave it in the last field.
        # NOTE(review): these bytes suggest the file is UTF-16LE - confirm.
        cleaned = line.replace('\x00', '').replace('\xff\xfe', '').strip()
        data.append(cleaned.split('\t'))
        line_counter += 1
        if line_counter % blocksize == 0:
            _insert_x2_block(data, line_counter)
            data = []
    # Flush the final partial block; without this, every line after the last
    # multiple of blocksize would never reach the database.
    if data:
        _insert_x2_block(data, line_counter)
        data = []

conn.close()

I want to be able to check whether a given pair of words exists in the data, for example "like new".

When I specify only the first word, the program works fine.

# List every bigram whose first word is "like": x2 stores word IDs, so lex
# is joined twice to translate w1 and w2 back into word text.
# The word is passed as a bound parameter, not embedded in the SQL text, and
# the statement is closed with real ''' quotes (the typographic quotes used
# before were a syntax error).
cur.execute('''SELECT lex1.w1, lex2.w1 from x2 
                INNER JOIN lex as lex1 ON lex1.wID=x2.w1
                INNER JOIN lex as lex2 ON lex2.wID=x2.w2
                WHERE lex1.w1 = ?''', ('like',))

But when I specify both words of the pair, the query is very slow.

# Check whether the bigram "like new" exists: lex is joined twice to turn the
# two word IDs in x2 into word text, then both words are filtered.
# The words are bound parameters; SQLite treats a double-quoted token as an
# identifier first and only falls back to a string literal, so "like"/"new"
# in the SQL text were fragile.
cur.execute('''SELECT lex1.w1, lex2.w1 from x2 
                    INNER JOIN lex as lex1 ON lex1.wID=x2.w1
                    INNER JOIN lex as lex2 ON lex2.wID=x2.w2
                    WHERE lex1.w1 = ? AND lex2.w1 = ?''', ('like', 'new'))

, . .

+4
3

x2 .

# x2 declared with a composite primary key on the two word-ID columns, in
# addition to the foreign keys that tie each ID to lex(wID).
c.execute('''CREATE TABLE x2
             (freq INT, w1 INT, w2 INT,
              FOREIGN KEY(w1) REFERENCES lex(wID),
              FOREIGN KEY(w2) REFERENCES lex(wID),
              PRIMARY KEY(w1, w2) )''')

, , . , (w1, w2) , , , .


, , .

# Rebuild an existing x2 table so that it gains the (w1, w2) primary key:
# create the new table, copy every row across, then swap the names.
c.execute('''
    create table x2_new (
        freq INT, w1 INT, w2 INT,
        FOREIGN KEY(w1) REFERENCES lex(wID),
        FOREIGN KEY(w2) REFERENCES lex(wID),
        PRIMARY KEY(w1, w2) )
''')
# select * copies by column position, so this assumes the old x2 has the
# same column order (freq, w1, w2) as x2_new.
c.execute('insert into x2_new select * from x2')
c.execute('drop table x2')
c.execute('alter table x2_new rename to x2')
# Make the whole swap permanent.
conn.commit()

.

# Number of input lines buffered before each executemany()/commit().
blocksize = 100000


def _insert_lex_block(rows, last_line):
    """Insert one buffered block of lexicon rows and commit it.

    rows is a list of field lists, one per input line; last_line is the
    1-based number of the last input line in the block and is used only
    for the error report.  A block that cannot be bound is rolled back and
    reported instead of aborting the whole load.
    """
    try:
        c.executemany('insert or replace into lex values (?, ?, ?, ?, ?)', rows)
        conn.commit()
    except (IndexError, sqlite3.ProgrammingError):
        # sqlite3 signals a row with the wrong number of fields with
        # ProgrammingError; IndexError is kept for backward compatibility.
        block_start = last_line - len(rows) + 1
        print('Lex error lines {}-{}'.format(block_start, last_line))
        conn.rollback()


with open('/Users/.../lexicon.txt', "rb") as lex_file:
    data = []
    line_counter = 0
    for line in lex_file:
        # One tab-separated record per line: wID, w1, w1cs, L1, c1.
        data.append(line.strip().split('\t'))
        line_counter += 1
        if line_counter % blocksize == 0:
            _insert_lex_block(data, line_counter)
            data = []
    # Flush the final partial block; without this, every line after the last
    # multiple of blocksize (or the whole file, if it is shorter than
    # blocksize) would never reach the database.
    if data:
        _insert_lex_block(data, line_counter)
        data = []

def _insert_x2_block(rows, last_line):
    """Insert one buffered block of bigram rows and commit it.

    rows is a list of field lists, one per input line; last_line is the
    1-based number of the last input line in the block and is used only
    for the error report.  A block that cannot be bound is rolled back and
    reported instead of aborting the whole load.
    """
    try:
        c.executemany('insert or replace into x2 values (?, ?, ?)', rows)
        conn.commit()
    except (IndexError, sqlite3.ProgrammingError):
        # sqlite3 signals a row with the wrong number of fields with
        # ProgrammingError; IndexError is kept for backward compatibility.
        block_start = last_line - len(rows) + 1
        print('x2 error lines {}-{}'.format(block_start, last_line))
        conn.rollback()


with open('/Users/.../x2.txt', "rb") as x2_file:
    data = []
    line_counter = 0
    for line in x2_file:
        # Remove the NUL bytes and the '\xff\xfe' marker BEFORE stripping
        # whitespace: a NUL next to the line ending would otherwise shield a
        # trailing '\r' from strip() and leave it in the last field.
        # NOTE(review): these bytes suggest the file is UTF-16LE - confirm.
        cleaned = line.replace('\x00', '').replace('\xff\xfe', '').strip()
        data.append(cleaned.split('\t'))
        line_counter += 1
        if line_counter % blocksize == 0:
            _insert_x2_block(data, line_counter)
            data = []
    # Flush the final partial block; without this, every line after the last
    # multiple of blocksize would never reach the database.
    if data:
        _insert_x2_block(data, line_counter)
        data = []
+2

EXPLAIN , x2, lex x2, . lex , x2 - .

, x2. . ( lex_index , wID ( ).

To speed up the word lookups, create an index on w1:

-- Index lex on the word text so that lookups of the form w1 = '...' can use it.
CREATE INDEX lex_w1_index ON lex(w1);

For x2, create an index that covers both w1 and w2:

-- Composite index over both word-ID columns of x2, in the order (w1, w2).
CREATE INDEX x2_w1_w2_index ON x2(w1, w2);

(. ).


, :

-- Return the frequency of the bigram "like new": each scalar subquery
-- resolves a word to its wID in lex, and x2 is then filtered on both IDs.
-- NOTE(review): a scalar subquery yields a single value, so if lex holds
-- several rows with the same w1 only one of them is used - confirm that
-- this is acceptable for the data.
SELECT freq
FROM x2
WHERE w1 = (SELECT wID FROM lex WHERE w1 = 'like')
  AND w2 = (SELECT wID FROM lex WHERE w1 = 'new')

; , . ( , .)

+3

, , . , :

-- Step 1: materialise every bigram whose first word is 'like', keeping the
-- lex columns of that first word plus the ID of the second word (x2.w2).
DROP TABLE IF EXISTS x2_temp;
CREATE TABLE x2_temp AS
    SELECT lex.*, x2.w2 from x2 
        INNER JOIN lex ON lex.wID=x2.w1
        WHERE lex.w1 = 'like';

-- Step 2: join the stored second-word IDs back to lex and keep only the
-- rows whose second word is 'new'.
SELECT x2_temp.*, lex.* from x2_temp
    INNER JOIN lex ON lex.wID=x2_temp.w2
    WHERE lex.w1 = 'new';

, ( , ):

-- Single-statement form of the same two-step lookup: the inner SELECT (x)
-- collects the bigrams whose first word is 'like', and the outer join keeps
-- those whose second word is 'new'.
SELECT x.*, lex.* FROM 
    (SELECT lex.*, x2.w2 FROM x2 
        INNER JOIN lex ON lex.wID=x2.w1
        WHERE lex.w1 = 'like') AS x
    INNER JOIN lex ON lex.wID=x.w2
    WHERE lex.w1 = 'new';

( sqlite3, , , .)

+2
source

All Articles