Speeding up pandas DataFrame iteration

I have a DataFrame with dates and values:

 Date     Price
Jun 30    95.60
Jun 29    94.40
Jun 28    93.59
Jun 27    92.04
Jun 24    93.40
Jun 23    96.10
Jun 22    95.55
Jun 21    95.91
Jun 20    95.10
Jun 17    95.33
Jun 16    97.55
Jun 15    97.14
Jun 14    97.46
Jun 13    97.34
Jun 10    98.83
Jun 9     99.65
Jun 8     98.94
Jun 7     99.03
Jun 6     98.63
Jun 3     97.92
Jun 2     97.72

There is a function that iterates by date,

# Build a per-row flag: True when the row's value is greater than at least one
# of the four rows before it.  The first four rows are pre-filled with False.
indic_up = [False, False, False, False]
i = 4
# NOTE: this bound stops four labels before the last index label, so the final
# four rows get no entry in indic_up.
while i + 4 <= df.index[-1]:
    # The multi-line condition is wrapped in parentheses; ending a line with a
    # bare `or` and no continuation is a SyntaxError.
    # NOTE: DataFrame.get_value was removed in pandas 1.0; on current pandas
    # the equivalent scalar accessor is df.at[i, 'value'].
    if (
        df.get_value(i, 'value') > df.get_value(i - 1, 'value')
        or df.get_value(i, 'value') > df.get_value(i - 2, 'value')
        or df.get_value(i, 'value') > df.get_value(i - 3, 'value')
        or df.get_value(i, 'value') > df.get_value(i - 4, 'value')
    ):
        indic_up.append(True)
    else:
        indic_up.append(False)
    i = i + 1

The logic of this function is: if today's value is greater than the value of yesterday, the day before, or either of the two days before that, the result is True, otherwise False. It seems to me that these functions are very slow. Can I rewrite this loop in one of the following ways?

# Row-wise iteration: each step yields the index label and the row as a Series.
for index, row in df.iterrows():
    # Placeholder body: read column 'a' of the current row together with its label.
    row['a'], index

or

# Label-wise iteration: walk the index and look each cell up by label.
for idx in df.index:
    # Placeholder body: read column 'a' at the current label.
    # NOTE: the .ix indexer was removed in pandas 1.0; df.loc[idx, 'a'] is the
    # label-based replacement on current pandas.
    df.ix[idx, 'a'], idx

Or can I get a speedup by converting the data to a NumPy array?

+4
source share
2 answers

Invite SciPy too!

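Idea: each element is compared with the 4 elements before it. If the current element is not greater than any of them, i.e. it is the minimum of that 5-element window, the result is False. So, compute the minimum over each such window and compare it against the current element. This is a minimum filtering operation, which SciPy provides as minimum_filter.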

Implementation:

import numpy as np
# minimum_filter is imported from the public scipy.ndimage namespace; the
# scipy.ndimage.filters submodule path is deprecated.
from scipy.ndimage import minimum_filter

# Extract values from the relevant column into a NumPy array for further processing
A = df['value'].values

# Sliding minimum over a 5-element window.  origin=2 shifts the (normally
# centered) window so that it covers the current element and the four before it.
window_min = minimum_filter(A, footprint=np.ones((5,)), origin=2)

# An element that differs from its window minimum is greater than at least one
# of the four preceding elements, which is exactly the True case.
indic_up_out = A != window_min

# Set the first four to False because they do not have four predecessors
indic_up_out[:4] = False
+2

. . , , . @Divakar .

import pandas as pd
import timeit
import numpy as np

# Sample data from the question.  Prices are stored as floats (not strings) so
# that `>` compares them numerically instead of lexicographically.
df = pd.DataFrame({
    'Date': [
        'Jun 30', 'Jun 29', 'Jun 28', 'Jun 27', 'Jun 24', 'Jun 23', 'Jun 22',
        'Jun 21', 'Jun 20', 'Jun 17', 'Jun 16', 'Jun 15', 'Jun 14', 'Jun 13',
        'Jun 10', 'Jun 9', 'Jun 8', 'Jun 7', 'Jun 6', 'Jun 3', 'Jun 2',
    ],
    'value': [
        95.60, 94.40, 93.59, 92.04, 93.40, 96.10, 95.55,
        95.91, 95.10, 95.33, 97.55, 97.14, 97.46, 97.34,
        98.83, 99.65, 98.94, 99.03, 98.63, 97.92, 97.72,
    ],
})


def by_df_get_value():
    """Flag each row whose value exceeds at least one of the four preceding values.

    Benchmark baseline: every comparison fetches single cells from the
    module-level ``df`` by label.  Row labels are assumed to be the default
    integers 0..n-1, because predecessors are addressed as ``i - k``.

    ``DataFrame.get_value`` was removed in pandas 1.0, so the equivalent scalar
    accessor ``DataFrame.at`` is used to keep the benchmark runnable.

    Returns:
        list[bool]: Four leading ``False`` entries followed by one flag for
        every row from label 4 through the last label.
    """
    window = 4
    indic_up = [False] * window
    # An empty frame has no last label; -1 makes the range below empty instead
    # of raising IndexError on df.index[-1].
    last_label = df.index[-1] if len(df.index) else -1
    # Visit every row that has a full look-back window, through the last row,
    # so this function processes the same rows as by_list().
    for i in range(window, last_label + 1):
        current = df.at[i, 'value']
        # any() short-circuits like the original `or` chain and yields a plain bool.
        indic_up.append(
            any(current > df.at[i - k, 'value'] for k in range(1, window + 1))
        )
    return indic_up


def by_list():
    """Flag each row whose value exceeds at least one of the four preceding values.

    Converts the ``'value'`` column of the module-level ``df`` to a plain
    Python list once, so the loop does list indexing instead of per-cell
    DataFrame access.

    Returns:
        list[bool]: Four leading ``False`` entries followed by one flag for
        every element from position 4 onward.
    """
    window = 4
    values = df['value'].tolist()
    indic_up = [False] * window
    # Positions below `window` lack a full look-back window and are covered by
    # the pre-filled False entries, so start at `window`.
    indic_up.extend(
        any(values[i] > prev for prev in values[i - window:i])
        for i in range(window, len(values))
    )
    return indic_up

# Time each implementation: 10 independent single-call runs, then report the mean.
# The callables are handed to timeit directly, so no `from __main__ import ...`
# setup string is needed and the timing also works when this file is imported.
for label, func in (
    ('by_df_get_value(): ', by_df_get_value),
    ('by_list(): ', by_list),
):
    total_time = timeit.repeat(func, repeat=10, number=1)
    print(label, '{:.20f}'.format(np.mean(total_time)))

Output:

by_df_get_value():  0.00015220100467558951
by_list():  0.00002649170055519790
+1

All Articles