You can write a cython function to speed up the calculation, the first thing you need to do is get the address of the fortran SPHJ function, here's how to do it in Python:
from scipy import special as sp sphj = sp.specfun.sphj import ctypes addr = ctypes.pythonapi.PyCObject_AsVoidPtr(ctypes.py_object(sphj._cpointer))
Then you can call the fortran function directly in Cython, note that I use prange() to use multicore to speed up the calculation:
%%cython -c-Ofast -c-fopenmp --link-args=-fopenmp from cpython.mem cimport PyMem_Malloc, PyMem_Free from cython.parallel import prange import numpy as np import cython from cpython cimport PyCObject_AsVoidPtr from scipy import special ctypedef void (*sphj_ptr) (const int *n, const double *x, const int *nm, const double *sj, const double *dj) nogil cdef sphj_ptr _sphj=<sphj_ptr>PyCObject_AsVoidPtr(special.specfun.sphj._cpointer) @cython.wraparound(False) @cython.boundscheck(False) def cython_sphj2(int n, double[::1] x): cdef int count = x.shape[0] cdef double * sj = <double *>PyMem_Malloc(count * sizeof(double) * (n + 1)) cdef double * dj = <double *>PyMem_Malloc(count * sizeof(double) * (n + 1)) cdef int * mn = <int *>PyMem_Malloc(count * sizeof(int)) cdef double[::1] res = np.empty(count) cdef int i if count < 100: for i in range(x.shape[0]): _sphj(&n, &x[i], mn + i, sj + i*(n+1), dj + i*(n+1)) res[i] = sj[i*(n+1) + n] #choose the element you want here else: for i in prange(count, nogil=True): _sphj(&n, &x[i], mn + i, sj + i*(n+1), dj + i*(n+1)) res[i] = sj[i*(n+1) + n] #choose the element you want here PyMem_Free(sj) PyMem_Free(dj) PyMem_Free(mn) return res.base
To compare, here is the Python function calling sphj() in forloop:
import numpy as np def python_sphj(n, x): sphj = special.specfun.sphj res = np.array([sphj(n, v)[1][n] for v in x]) return res
Here are the% timit results for 10 items:
x = np.linspace(1, 2, 10) r1 = cython_sphj2(4, x) r2 = python_sphj(4, x) assert np.allclose(r1, r2) %timeit cython_sphj2(4, x) %timeit python_sphj(4, x)
result:
10000 loops, best of 3: 21.5 Β΅s per loop 10000 loops, best of 3: 28.1 Β΅s per loop
Here are the results for 100,000 items:
x = np.linspace(1, 2, 100000) r1 = cython_sphj2(4, x) r2 = python_sphj(4, x) assert np.allclose(r1, r2) %timeit cython_sphj2(4, x) %timeit python_sphj(4, x)
result:
10 loops, best of 3: 44.7 ms per loop 1 loops, best of 3: 231 ms per loop