The fastest will probably be np.einsum:
np.einsum('...j,...j->...', vf, vf)
numpy, , . , 32- , np.arange. , dtype, np.int64, np.double:
>>> np.einsum('...j,...j->...', vf,vf)[-1, -1, -1]
-603979762
>>> np.einsum('...j,...j->...', vf,vf).dtype
dtype('int32')
>>> np.einsum('...j,...j->...', vf,vf, dtype=np.int64)[-1, -1, -1]
7599823767207950
>>> np.einsum('...j,...j->...', vf,vf, dtype=np.double)[-1, -1, -1]
7599823767207950.0