Custom vectorization with arrays

I process the data of cloud points (150 thousand points per cloud). I would like for each point (x, y) to calculate the distance to the control point O and azimuth:

for each point p in points
    dx = p.x - ox
    dy = p.y - oy
    d = hypot(dx, dy)
    az = atan2(dy, dx)

I have a manual implementation of SSE. I was hoping to make the code more understandable using my own:

ArrayXf x(points.size()), y(points.size());
for(unsigned i=0; i<points.size(); ++i) {
    x[i] = points[i].x;
    y[i] = points[i].y;
}
const ArrayXf d = (dx.square() + dy.square()).sqrt();
// implement a polynomial approximation to atan (same as the SSE)

However, from my temporary experiments, this does not seem to be vectorized at all, because the time is the same as when implementing the baseline. And I know that SSE2 is enabled, since I am compiling some SSE2 code in the same file.

However, according to the document, Eigen uses SSE2 with its support (and AVX in 3.3). Is it only for vector and matrix operations?

EDIT: I studied the prepared assembly code and contains some SSE instructions. But he's still slow

EDIT: . 100 , 150 . .

  • atan2: 150ms
  • sse ( 4 4 , ): 30
  • : 90ms (diff: 36ms, hypot: 16ms, index: 17ms)

:

const Eigen::Map<const Eigen::ArrayXf, Eigen::Unaligned, Eigen::InnerStride<4> > px(&(points[0].x), points.size());
const Eigen::Map<const Eigen::ArrayXf, Eigen::Unaligned, Eigen::InnerStride<4> > py(&(points[0].y), points.size());

// difference with the origin (ox and oy are floats)
const Eigen::ArrayXf dx = px - ox, dy = py - oy;

// distance and index
const Eigen::ArrayXf d = sqrt(dx.square() + dy.square());

static const float r_res_mult = 1.0f / r_res; //2x faster than div
const Eigen::ArrayXi didx = (d * r_res_mult).cast<int>();
+4
2

, , SIMD. (xyxyxyxyxyxy...), ,

for(unsigned i=0; i<points.size(); ++i) {
    x[i] = points[i].x;
    y[i] = points[i].y;
}

(xxxxxxxx.... yyyyyyy...). .

, . aka . SSE, , , xxxxyyyyxxxxyyyy....

SIMD. Intel SVML, . AMD libm, , . . SIMD Agner Fog Vector Class Library (VCL). , Intel, AMD. , Eigen, , , Eigen, . . SSE AVX float (VLC AVX AVX).

//    g++ -O3 -Ivectorclass -msse4.2 foo.cpp
// or g++ -O3 -Ivectorclass -mavx foo.cpp
#include <vectorclass.h>
#include <vectormath_trig.h>

struct Point2DBlock {
    float x[8];
    float y[8];
};

int main(void) {
    const int nblocks = 10; //each block contains eight points
    Point2DBlock aosoa[nblocks]; //xxxxxxxxyyyyyyyy xxxxxxxxyyyyyyyy ...
    float ox = 0.0f, oy = 0.0f;
    Vec8f vox = ox, voy = oy;
    for(int i=0; i<nblocks; i++) {
        Vec8f dx = Vec8f().load(aosoa[i].x) - vox;
        Vec8f dy = Vec8f().load(aosoa[i].y) - voy;
        Vec8f d  = sqrt(dx*dx + dy*dy);
        Vec8f az = atan2(dy,dx);
    } 
}

hypot. VCL, .

static inline Vec8f hypot(Vec8f const &x, Vec8f const &y) {
    Vec8f t;
    Vec8f ax = abs(x), ay = abs(y);
    t  = min(ax,ay);
    ax = max(ax,ay);
    t  = t/ax;
    return ax*sqrt(1+t*t);
}

Edit:

, . , . VLC - .

#include <vectorclass.h>
#include <vectormath_trig.h>

int main(void) {
    const int npoints=80;
    float points[2*npoints]; //xyxyxyxyxyxy...
    float ox = 0.0, oy = 0.0;
    Vec8f vox = ox, voy = oy;
    for(int i=0; i<npoints; i+=16) {
        Vec8f l1 = Vec8f().load(&points[i+0]);
        Vec8f l2 = Vec8f().load(&points[i+8]);
        Vec8f dx = blend8f<0, 2, 4, 6, 8, 10, 12, 14>(l1,l2) - vox;
        Vec8f dy = blend8f<1, 3, 5, 7, 9, 11, 13, 15>(l1,l2) - voy;
        Vec8f d  = sqrt(dx*dx + dy*dy);
        Vec8f az = atan2(dy,dx);
    } 
}
+4

. , . . , , . . :

int sz = 15000000;
std::vector<Point> points(sz);

Eigen::Map<ArrayXd, Unaligned, InnerStride<2>> mapX(&(points[0].x), sz);
Eigen::Map<ArrayXd, Unaligned, InnerStride<2>> mapY(&(points[0].y), sz);

mapX = ArrayXd::Random(sz);
mapY = ArrayXd::Random(sz);

auto cpstart = std::chrono::high_resolution_clock::now();
ArrayXd x = mapX;
ArrayXd y = mapY;
ArrayXd d;
auto cpend = std::chrono::high_resolution_clock::now();

auto mpSumstart = std::chrono::high_resolution_clock::now();

d = (mapX.square() + mapY.square()).sqrt().eval();

auto mpSumend = std::chrono::high_resolution_clock::now();

std::cout << d.mean() << "\n";

auto arStart = std::chrono::high_resolution_clock::now();

d = (x.square() + y.square()).sqrt().eval();

auto arEnd = std::chrono::high_resolution_clock::now();

std::cout << d.mean() << "\n";

auto elapsed = cpend - cpstart;
std::cout << "Copy: " <<  elapsed.count() << '\n';
std::cout << "Map: " <<  (mpSumend - mpSumstart).count() << '\n';
std::cout << "Array: " <<  (arEnd - arStart).count() << '\n';

, , 100 , , . 90 (VS2012/Ox Release (-DNDEBUG)), 185ms, 90 . SIMD, . , .

EDIT: EIGEN_DONT_VECTORIZE, () ( ). . . Unaligned. , .

2. , . x, y std::complex<double>, :

Eigen::Map<ArrayXcd> mapC((std::complex<double>*)(&(points[0].x)), sz);
//...
cd = mapC.cwiseAbs2().sqrt().eval();

, . ,

cd = (mapC - std::complex<double>(ox, oy)).cwiseAbs2().sqrt().eval();
+3

All Articles