I cannot figure out how to implement:
__m256d min(__m256d A, __m256d B, __m256d C, __m256d D)
{
__m256d result;
// result should contain 4 minimal values out of 16 : A[0], A[1], A[2], A[3], B[0], ... , D[3]
// moreover it should be result[0] <= result[1] <= result[2] <= result[2]
return result;
}
Any ideas of how to use _mm256_min_pd, _mm256_max_pd and shuffles/permutes in a smart way?
==================================================
This where I got so far, after:
__m256d T = _mm256_min_pd(A, B);
__m256d Q = _mm256_max_pd(A, B);
A = T; B = Q;
T = _mm256_min_pd(C, D);
Q = _mm256_max_pd(C, D);
C = T; D = Q;
T = _mm256_min_pd(B, C);
Q = _mm256_max_pd(B, C);
B = T; C = Q;
T = _mm256_min_pd(A, B);
Q = _mm256_max_pd(A, B);
A = T; D = Q;
T = _mm256_min_pd(C, D);
Q = _mm256_max_pd(C, D);
C = T; D = Q;
T = _mm256_min_pd(B, C);
Q = _mm256_max_pd(B, C);
B = T; C = Q;
we have : A[0] < B[0] < C[0] < D[0], A[1] < B[1] < C[1] < D[1], A[2] < B[2] < C[2] < D[2], A[3] < B[3] < C[3] < D[3],
so the minimal value is among A's, second minimal is among A's or B's, ... Not sure where to go from there ...
========================================================
Second idea is that the problem is reducible to itself, but with 2 input __m256 elements. If this can be done, then just do min4(A,B) --> P, min4(C,D) --> Q, min4(P,Q) --> return value.
No idea how to that for two vectors though :)
=======================================================================
Update 2 : problem almost solved -- the following function computes 4 minimal values.
__m256d min4(__m256d A, __m256d B, __m256d C, __m256d D)
{
__m256d T;
T = _mm256_min_pd(A, B);
B = _mm256_max_pd(A, B);
B = _mm256_permute_pd(B, 0x5);
A = _mm256_min_pd(T, B);
B = _mm256_max_pd(T, B);
B = _mm256_permute2f128_pd(B, B, 0x1);
T = _mm256_min_pd(A, B);
B = _mm256_max_pd(A, B);
B = _mm256_permute_pd(B, 0x5);
A = _mm256_min_pd(A, B);
T = _mm256_min_pd(C, D);
D = _mm256_max_pd(C, D);
D = _mm256_permute_pd(D, 0x5);
C = _mm256_min_pd(T, D);
D = _mm256_max_pd(T, D);
D = _mm256_permute2f128_pd(D, D, 0x1);
T = _mm256_min_pd(C, D);
D = _mm256_max_pd(C, D);
D = _mm256_permute_pd(D, 0x5);
C = _mm256_min_pd(C, D);
T = _mm256_min_pd(A, C);
C = _mm256_max_pd(A, C);
C = _mm256_permute_pd(C, 0x5);
A = _mm256_min_pd(T, C);
C = _mm256_max_pd(T, C);
C = _mm256_permute2f128_pd(C, C, 0x1);
T = _mm256_min_pd(A, C);
C = _mm256_max_pd(A, C);
C = _mm256_permute_pd(C, 0x5);
A = _mm256_min_pd(A, C);
return A;
};
All that remains is to sort the values in increasing order inside A before return.