Optimized processing of very large files

My task is relatively simple: for each line in the input file, check whether the line satisfies the given set of conditions, and if so, write the specific columns of this line in a new file. I wrote a python script that does this, but I would like to help 1) improve speed, 2) the best way to work in terms of column names (since column numbers can vary from file to file), and 3) the best way to specify filtering conditions and desired output columns.

1) The files I work with contain photometry for astronomical images. Each file has about 1 e6 lines per 150 columns of floats, usually larger than 1 GB. I have an old AWK script that processes such files in about 1 minute; my python script takes 5 to 7 minutes. I often have to adjust the filtering conditions and re-run several times until the output file becomes what I want, so speed is definitely desirable. I found that the for loop is pretty fast; this is how i do things inside the loop that slow it down. Using itemgetter to select only the columns that I want was a big improvement over reading the entire row in memory, but I'm not sure what I can do to further increase the speed. Could it be as fast as AWK?

2) I would like to work with column names instead of column numbers, since the number of columns of a certain size (number of photons, background, signal-to-noise, etc.) can vary between files. In my AWK script, I always need to check the correctness of the column numbers where the conditions and output columns are indicated, even if filtering and output are applied to the same quantities. My solution in python was to create a dictionary that assigns a column number to each value. When a file has different columns, I only need to specify a new dictionary. Perhaps there is a better way to do this?

3) Finally, what is the best way to specify the filtering conditions and the desired output columns? Right now I pass them to the script as strings — e.g. "SNR > 4", where "SNR" is looked up in the name-to-column dictionary — and evaluate them with eval/exec. This works, but is there a cleaner and safer approach?

(My current python script is below.) Any suggestions on how to speed it up, how to better handle the column names, or how to better specify the conditions and output columns would be much appreciated.

Thanks!

#! /usr/bin/env python2.6
"""Filter a large photometry table: keep rows that pass the quality cuts
and write the selected columns to a new file."""

from operator import itemgetter


infile = 'ugc4305_1.phot'
outfile = 'ugc4305_1_filt.phot'

# Names used here must exist in the `columns` dictionary below.
conditions = 'OBJ <= 2 and SNR1 > 4 and SNR2 > 4 and FLAG1 < 8 and FLAG2 < 8 and (SHARP1 + SHARP2)**2 < 0.075 and (CROWD1 + CROWD2) < 0.1'

# Must contain every quantity referenced in `conditions`.
# (Renamed from `input`, which shadowed the builtin of the same name.)
input_names = 'OBJ, SNR1, SNR2, FLAG1, FLAG2, SHARP1, SHARP2, CROWD1, CROWD2'

output = 'X, Y, OBJ, COUNTS1, BG1, ACS1, ERR1, CHI1, SNR1, SHARP1, ROUND1, CROWD1, FLAG1, COUNTS2, BG2, ACS2, ERR2, CHI2, SNR2, SHARP2, ROUND2, CROWD2, FLAG2'

# Dictionary of column numbers for the more important quantities.
columns = dict(EXT=0, CHIP=1, X=2, Y=3, CHI_GL=4, SNR_GL=5, SHARP_GL=6, ROUND_GL=7, MAJAX_GL=8, CROWD_GL=9, OBJ=10, COUNTS1=11, BG1=12, ACS1=13, STD1=14, ERR1=15, CHI1=16, SNR1=17, SHARP1=18, ROUND1=19, CROWD1=20, FWHM1=21, ELLIP1=22, PSFA1=23, PSFB1=24, PSFC1=25, FLAG1=26, COUNTS2=27, BG2=28, ACS2=29, STD2=30, ERR2=31, CHI2=32, SNR2=33, SHARP2=34, ROUND2=35, CROWD2=36, FWHM2=37, ELLIP2=38, PSFA2=39, PSFB2=40, PSFC2=41, FLAG2=42)


def _indices(names):
    # Map a comma-separated list of column names to their column numbers.
    return [columns[n] for n in names.replace(',', ' ').split()]


input_keys = input_names.replace(',', ' ').split()
get_inputs = itemgetter(*_indices(input_names))
get_outputs = itemgetter(*_indices(output))

# Compile the condition expression ONCE instead of re-parsing it with
# eval() on every line -- this removes the main per-line overhead of the
# original exec/eval approach.
# NOTE(review): the expression is still executed, so only use trusted
# condition strings.
cond_code = compile(conditions, '<conditions>', 'eval')

# One '%s' slot per output column.
string_format = ' '.join(['%s'] * len(_indices(output))) + '\n'


f = open(infile)
g = open(outfile, 'w')
try:
    # Main loop: test each line and write the selected columns on success.
    for line in f:
        fields = line.split()
        # float() replaces the per-field eval() of the original: faster,
        # and safe against arbitrary field contents.
        namespace = dict(zip(input_keys,
                             [float(v) for v in get_inputs(fields)]))
        if eval(cond_code, namespace):
            g.write(string_format % get_outputs(fields))
finally:
    # Close both files even if a malformed line raises mid-loop.
    f.close()
    g.close()
+5
6

First of all, since the file is delimited text, take a look at the csv module: csv.DictReader gives you each row as a dictionary keyed by column name, which addresses point 2 directly (and may help with point 1 as well), while still reading one row at a time.

Also, profile the script with cProfile, Python's built-in profiler, before optimizing further; it will show you where the time is actually being spent.

+2

The first thing I would do is get rid of the exec() and eval() calls. Every time you eval a string, Python has to compile it first, and doing that for every line of a million-line file is expensive; besides being slow, eval on strings is fragile and a security hazard.

Since the conditions don't actually change from line to line, replace eval(conditions) with an ordinary function, something like:

def conditions(d):
    """Return True if the row dict *d* passes all quality cuts.

    *d* maps column names to numeric values.  (Fixed from the original
    sketch: keys must be quoted strings, 'SNRI' was a typo for 'SNR1',
    and the expression is completed so it actually parses.)
    """
    return (d['OBJ'] <= 2 and
            d['SNR1'] > 4 and
            d['SNR2'] > 4 and
            d['FLAG1'] < 8 and
            d['FLAG2'] < 8 and
            (d['SHARP1'] + d['SHARP2']) ** 2 < 0.075 and
            (d['CROWD1'] + d['CROWD2']) < 0.1)

That way the condition is compiled exactly once, by the Python interpreter itself, instead of being re-parsed for every row.

If you read each row into a dictionary, you can then refer to fields as line[COLNAME]; combined with the csv module above, that solves the column-name problem as well.

+2

- ...

My version of the script runs in roughly 35 seconds versus roughly 3 minutes for the original on my test data. (Most of the remaining time goes into the float conversions, which are hard to avoid.)

csv.DictReader would take care of mapping column names to values for you, so you would only maintain one tuple of names per file layout. (And if the files ever become genuinely comma-separated, csv also handles quoting and similar details correctly; a plain split() does not.)

Also, rather than hard-coding infile and outfile, read them from the command line (e.g. infile = sys.argv[1]) or use sys.stdin/sys.stdout, so you don't have to edit the script for every run. For example:

def main():
    """Entry point: filter the hard-coded photometry file."""
    source = 'ugc4305_1.phot'
    destination = 'ugc4305_1_filt.phot'
    process_data(source, destination)

def filter_conditions(row):
    """Return True if the photometry row passes all quality cuts.

    *row* maps column names to the raw string fields of one input line.
    Unlike the earlier version, the dictionary is NOT modified in place,
    so callers no longer need to pass a copy (passing one stays harmless).
    This also drops the py2-only dict.iteritems() call.
    """
    def get(name):
        # Fields arrive as strings; convert locally instead of mutating
        # the caller's dictionary.
        return float(row[name])

    return (get('OBJ') <= 2 and get('SNR1') > 4
            and get('SNR2') > 4 and get('FLAG1') < 8
            and get('FLAG2') < 8
            and (get('SHARP1') + get('SHARP2')) ** 2 < 0.075
            and (get('CROWD1') + get('CROWD2')) < 0.1)

def format_output(row):
    """Return the tab-separated output record built from *row*.

    Only the columns listed below are emitted, in this exact order.
    """
    wanted = ('X', 'Y', 'OBJ', 'COUNTS1', 'BG1', 'ACS1', 'ERR1', 'CHI1', 
                     'SNR1', 'SHARP1', 'ROUND1', 'CROWD1', 'FLAG1', 'COUNTS2', 
                     'BG2', 'ACS2', 'ERR2', 'CHI2', 'SNR2', 'SHARP2', 'ROUND2', 
                     'CROWD2', 'FLAG2')
    parts = [row[name] for name in wanted]
    return '\t'.join(parts)

def process_data(infilename, outfilename):
    """Filter *infilename* row by row, writing the selected columns of
    every row that passes filter_conditions to *outfilename*."""
    # One name per whitespace-separated field, in file order.
    column_names = ('EXT', 'CHIP', 'X', 'Y', 'CHI_GL', 'SNR_GL', 'SHARP_GL', 
                    'ROUND_GL', 'MAJAX_GL', 'CROWD_GL', 'OBJ', 'COUNTS1', 
                    'BG1', 'ACS1', 'STD1', 'ERR1', 'CHI1', 'SNR1', 'SHARP1', 
                    'ROUND1', 'CROWD1', 'FWHM1', 'ELLIP1', 'PSFA1', 'PSFB1', 
                    'PSFC1', 'FLAG1', 'COUNTS2', 'BG2', 'ACS2', 'STD2', 
                    'ERR2', 'CHI2', 'SNR2', 'SHARP2', 'ROUND2', 'CROWD2', 
                    'FWHM2', 'ELLIP2', 'PSFA2', 'PSFB2', 'PSFC2', 'FLAG2')

    with open(infilename) as source:
        with open(outfilename, 'w') as sink:
            for raw_line in source:
                fields = raw_line.strip().split()
                record = dict(zip(column_names, fields))
                # filter_conditions converts values to float in place,
                # so hand it a copy and keep the string originals for output.
                if filter_conditions(record.copy()):
                    sink.write(format_output(record) + '\n')

if __name__ == '__main__':
    main()
+2

As nmichaels said, you can use the fieldnames and dialect parameters of csv.DictReader to read this file. Each row then comes back as a dictionary, so instead of eval you can simply write:

# Illustrative fragment: with csv.DictReader each row is a dict keyed by
# column name, so conditions can be written directly without eval().
# NOTE(review): pseudo-code only -- `g` and `data_item` are defined
# elsewhere, and file.write() takes a single string, so a real version
# needs formatting, e.g. g.write('%s %s %s\n' % (...)).
if data_item['OBJ'] <= 2 and data_item['SNR1']:
     g.write(data_item['X'], data_item['Y'], data_item['OBJ'])

That way, when you re-run with different conditions, you edit ordinary Python code rather than building eval strings. It is both faster and safer.

+1

Since you re-run the same files several times with different conditions, the parsing — not Python start-up — is what you pay for on every run.

One idea is to parse the file once and save the parsed rows with pickle/cPickle; later runs of the script can then load the pickle instead of re-parsing the text (cPickle is the C implementation and is much faster than the pure-Python pickle).

In Python, converting every field from string to float tends to dominate the run time of a loop like this, so avoiding repeated parsing can be a significant win. (# presumably this matches your profile — verify with cProfile.)

If disk I/O is the bottleneck, you can also try compressing your input files and reading them with the gzip module.

0
source

Have you tried pandas ?

I believe that OBJ, SNR1, ... are column names, and I assume you apply the same conditions to all of your rows. If that is the case, I suggest you go with pandas.

Your code snippet will look something like this:

import pandas as pd

infile = 'ugc4305_1.phot'
outfile = 'ugc4305_1_filt.phot'

# The .phot file is whitespace-delimited (the other answers parse it with
# line.split()), so the default comma separator would not split it.
df = pd.read_csv(infile, sep=r'\s+')

# Column names fixed: 'SRN1'/'SRN2' were typos for 'SNR1'/'SNR2'.
condition = ((df['OBJ'] <= 2) & (df['SNR1'] > 4) & (df['SNR2'] > 4)
             & (df['FLAG1'] < 8) & (df['FLAG2'] < 8)
             & ((df['SHARP1'] + df['SHARP2']) ** 2 < 0.075)
             & ((df['CROWD1'] + df['CROWD2']) < 0.1))

# Column names you want in the result (the question's output list).
columnNames = ['X', 'Y', 'OBJ', 'COUNTS1', 'BG1', 'ACS1', 'ERR1', 'CHI1',
               'SNR1', 'SHARP1', 'ROUND1', 'CROWD1', 'FLAG1', 'COUNTS2',
               'BG2', 'ACS2', 'ERR2', 'CHI2', 'SNR2', 'SHARP2', 'ROUND2',
               'CROWD2', 'FLAG2']

# Bug fix: select the wanted columns from the *filtered* rows; the
# original re-selected from df, silently discarding the filter.
newDf = df.loc[condition, columnNames]

# index=False keeps the synthetic row index out of the output file.
newDf.to_csv(outfile, index=False)
0
source

All Articles