Skip to content

Commit 90d25c6

Browse files
committed
Accelerate some Level-2 operations with Math.fma for Java 11
1 parent 133fdca commit 90d25c6

File tree

1 file changed

+218
-0
lines changed

1 file changed

+218
-0
lines changed

blas/src/main/java/dev/ludovic/netlib/blas/Java11BLAS.java

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2161,6 +2161,224 @@ protected void sgemmTT(int m, int n, int k, float alpha, float[] a, int offseta,
21612161
}
21622162
}
21632163

2164+
protected void dgemvN(int m, int n, double alpha, double[] a, int offseta, int lda, double[] x, int offsetx, int incx, double beta, double[] y, int offsety, int incy) {
2165+
if (beta != 1.0) {
2166+
int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0;
2167+
for (; row < m; row += 1, iy += incy) {
2168+
if (beta != 0.0) {
2169+
y[offsety + iy] = beta * y[offsety + iy];
2170+
} else {
2171+
y[offsety + iy] = 0.0;
2172+
}
2173+
}
2174+
}
2175+
int col = 0, ix = incx < 0 ? (n - 1) * -incx : 0;
2176+
for (; col < loopBound(n, 4); col += 4, ix += incx * 4) {
2177+
int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0;
2178+
double alphax0 = alpha * x[offsetx + ix + incx * 0];
2179+
double alphax1 = alpha * x[offsetx + ix + incx * 1];
2180+
double alphax2 = alpha * x[offsetx + ix + incx * 2];
2181+
double alphax3 = alpha * x[offsetx + ix + incx * 3];
2182+
for (; row < m; row += 1, iy += incy) {
2183+
y[offsety + iy] = Math.fma(alphax0, a[offseta + row + (col + 0) * lda],
2184+
Math.fma(alphax1, a[offseta + row + (col + 1) * lda],
2185+
Math.fma(alphax2, a[offseta + row + (col + 2) * lda],
2186+
Math.fma(alphax3, a[offseta + row + (col + 3) * lda], y[offsety + iy]))));
2187+
}
2188+
}
2189+
for (; col < n; col += 1, ix += incx) {
2190+
int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0;
2191+
double alphax = alpha * x[offsetx + ix];
2192+
for (; row < m; row += 1, iy += incy) {
2193+
y[offsety + iy] = Math.fma(alphax, a[offseta + row + col * lda], y[offsety + iy]);
2194+
}
2195+
}
2196+
}
2197+
2198+
protected void dgemvT(int m, int n, double alpha, double[] a, int offseta, int lda, double[] x, int offsetx, int incx, double beta, double[] y, int offsety, int incy) {
2199+
int col = 0, iy = incy < 0 ? (n - 1) * -incy : 0;
2200+
for (; col < loopBound(n, 4); col += 4, iy += incy * 4) {
2201+
int row = 0, ix = incx < 0 ? (m - 1) * -incx : 0;
2202+
double sum0 = 0.0;
2203+
double sum1 = 0.0;
2204+
double sum2 = 0.0;
2205+
double sum3 = 0.0;
2206+
for (; row < m; row += 1, ix += incx) {
2207+
double xix = x[offsetx + ix];
2208+
sum0 = Math.fma(xix, a[offseta + row + (col + 0) * lda], sum0);
2209+
sum1 = Math.fma(xix, a[offseta + row + (col + 1) * lda], sum1);
2210+
sum2 = Math.fma(xix, a[offseta + row + (col + 2) * lda], sum2);
2211+
sum3 = Math.fma(xix, a[offseta + row + (col + 3) * lda], sum3);
2212+
}
2213+
if (beta != 0.0) {
2214+
y[offsety + iy + incy * 0] = alpha * sum0 + beta * y[offsety + iy + incy * 0];
2215+
y[offsety + iy + incy * 1] = alpha * sum1 + beta * y[offsety + iy + incy * 1];
2216+
y[offsety + iy + incy * 2] = alpha * sum2 + beta * y[offsety + iy + incy * 2];
2217+
y[offsety + iy + incy * 3] = alpha * sum3 + beta * y[offsety + iy + incy * 3];
2218+
} else {
2219+
y[offsety + iy + incy * 0] = alpha * sum0;
2220+
y[offsety + iy + incy * 1] = alpha * sum1;
2221+
y[offsety + iy + incy * 2] = alpha * sum2;
2222+
y[offsety + iy + incy * 3] = alpha * sum3;
2223+
}
2224+
}
2225+
for (; col < n; col += 1, iy += incy) {
2226+
int row = 0, ix = incx < 0 ? (m - 1) * -incx : 0;
2227+
double sum = 0.0;
2228+
for (; row < m; row += 1, ix += incx) {
2229+
sum = Math.fma(x[offsetx + ix], a[offseta + row + col * lda], sum);
2230+
}
2231+
if (beta != 0.0) {
2232+
y[offsety + iy] = alpha * sum + beta * y[offsety + iy];
2233+
} else {
2234+
y[offsety + iy] = alpha * sum;
2235+
}
2236+
}
2237+
}
2238+
2239+
protected void sgemvN(int m, int n, float alpha, float[] a, int offseta, int lda, float[] x, int offsetx, int incx, float beta, float[] y, int offsety, int incy) {
2240+
// y = beta * y
2241+
for (int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0; row < m; row += 1, iy += incy) {
2242+
if (beta != 0.0f) {
2243+
y[offsety + iy] = beta * y[offsety + iy];
2244+
} else {
2245+
y[offsety + iy] = 0.0f;
2246+
}
2247+
}
2248+
// y += alpha * A * x
2249+
int col = 0, ix = incx < 0 ? (n - 1) * -incx : 0;
2250+
for (; col < loopBound(n, 8); col += 8, ix += incx * 8) {
2251+
float alphax0 = alpha * x[offsetx + ix + incx * 0];
2252+
float alphax1 = alpha * x[offsetx + ix + incx * 1];
2253+
float alphax2 = alpha * x[offsetx + ix + incx * 2];
2254+
float alphax3 = alpha * x[offsetx + ix + incx * 3];
2255+
float alphax4 = alpha * x[offsetx + ix + incx * 4];
2256+
float alphax5 = alpha * x[offsetx + ix + incx * 5];
2257+
float alphax6 = alpha * x[offsetx + ix + incx * 6];
2258+
float alphax7 = alpha * x[offsetx + ix + incx * 7];
2259+
for (int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0; row < m; row += 1, iy += incy) {
2260+
y[offsety + iy] = Math.fma(alphax0, a[offseta + row + (col + 0) * lda],
2261+
Math.fma(alphax1, a[offseta + row + (col + 1) * lda],
2262+
Math.fma(alphax2, a[offseta + row + (col + 2) * lda],
2263+
Math.fma(alphax3, a[offseta + row + (col + 3) * lda],
2264+
Math.fma(alphax4, a[offseta + row + (col + 4) * lda],
2265+
Math.fma(alphax5, a[offseta + row + (col + 5) * lda],
2266+
Math.fma(alphax6, a[offseta + row + (col + 6) * lda],
2267+
Math.fma(alphax7, a[offseta + row + (col + 7) * lda], y[offsety + iy]))))))));
2268+
}
2269+
}
2270+
for (; col < n; col += 1, ix += incx) {
2271+
float alphax = alpha * x[offsetx + ix];
2272+
for (int row = 0, iy = incy < 0 ? (m - 1) * -incy : 0; row < m; row += 1, iy += incy) {
2273+
y[offsety + iy] = Math.fma(alphax, a[offseta + row + col * lda], y[offsety + iy]);
2274+
}
2275+
}
2276+
}
2277+
2278+
protected void sgemvT(int m, int n, float alpha, float[] a, int offseta, int lda, float[] x, int offsetx, int incx, float beta, float[] y, int offsety, int incy) {
2279+
int col = 0, iy = incy < 0 ? (n - 1) * -incy : 0;
2280+
for (; col < loopBound(n, 8); col += 8, iy += incy * 8) {
2281+
float sum0 = 0.0f;
2282+
float sum1 = 0.0f;
2283+
float sum2 = 0.0f;
2284+
float sum3 = 0.0f;
2285+
float sum4 = 0.0f;
2286+
float sum5 = 0.0f;
2287+
float sum6 = 0.0f;
2288+
float sum7 = 0.0f;
2289+
for (int row = 0, ix = incx < 0 ? (m - 1) * -incx : 0; row < m; row += 1, ix += incx) {
2290+
sum0 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 0) * lda], sum0);
2291+
sum1 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 1) * lda], sum1);
2292+
sum2 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 2) * lda], sum2);
2293+
sum3 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 3) * lda], sum3);
2294+
sum4 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 4) * lda], sum4);
2295+
sum5 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 5) * lda], sum5);
2296+
sum6 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 6) * lda], sum6);
2297+
sum7 = Math.fma(x[offsetx + ix], a[offseta + row + (col + 7) * lda], sum7);
2298+
}
2299+
if (beta != 0.0f) {
2300+
y[offsety + iy + incy * 0] = alpha * sum0 + beta * y[offsety + iy + incy * 0];
2301+
y[offsety + iy + incy * 1] = alpha * sum1 + beta * y[offsety + iy + incy * 1];
2302+
y[offsety + iy + incy * 2] = alpha * sum2 + beta * y[offsety + iy + incy * 2];
2303+
y[offsety + iy + incy * 3] = alpha * sum3 + beta * y[offsety + iy + incy * 3];
2304+
y[offsety + iy + incy * 4] = alpha * sum4 + beta * y[offsety + iy + incy * 4];
2305+
y[offsety + iy + incy * 5] = alpha * sum5 + beta * y[offsety + iy + incy * 5];
2306+
y[offsety + iy + incy * 6] = alpha * sum6 + beta * y[offsety + iy + incy * 6];
2307+
y[offsety + iy + incy * 7] = alpha * sum7 + beta * y[offsety + iy + incy * 7];
2308+
} else {
2309+
y[offsety + iy + incy * 0] = alpha * sum0;
2310+
y[offsety + iy + incy * 1] = alpha * sum1;
2311+
y[offsety + iy + incy * 2] = alpha * sum2;
2312+
y[offsety + iy + incy * 3] = alpha * sum3;
2313+
y[offsety + iy + incy * 4] = alpha * sum4;
2314+
y[offsety + iy + incy * 5] = alpha * sum5;
2315+
y[offsety + iy + incy * 6] = alpha * sum6;
2316+
y[offsety + iy + incy * 7] = alpha * sum7;
2317+
}
2318+
}
2319+
for (; col < n; col += 1, iy += incy) {
2320+
float sum = 0.0f;
2321+
for (int row = 0, ix = incx < 0 ? (m - 1) * -incx : 0; row < m; row += 1, ix += incx) {
2322+
sum = Math.fma(x[offsetx + ix], a[offseta + row + col * lda], sum);
2323+
}
2324+
if (beta != 0.0f) {
2325+
y[offsety + iy] = alpha * sum + beta * y[offsety + iy];
2326+
} else {
2327+
y[offsety + iy] = alpha * sum;
2328+
}
2329+
}
2330+
}
2331+
2332+
protected void dgerK(int m, int n, double alpha, double[] x, int offsetx, int incx, double[] y, int offsety, int incy, double[] a, int offseta, int lda) {
2333+
int col = 0, iy = incy < 0 ? (n - 1) * -incy : 0;
2334+
for (; col < loopBound(n, 4); col += 4, iy += incy * 4) {
2335+
double alphayiy0 = alpha * y[offsety + iy + incy * 0];
2336+
double alphayiy1 = alpha * y[offsety + iy + incy * 1];
2337+
double alphayiy2 = alpha * y[offsety + iy + incy * 2];
2338+
double alphayiy3 = alpha * y[offsety + iy + incy * 3];
2339+
int row = 0, jx = incx < 0 ? (n - 1) * -incx : 0;
2340+
for (; row < m; row += 1, jx += incx) {
2341+
double xjx = x[offsetx + jx];
2342+
a[offseta + row + (col + 0) * lda] = Math.fma(alphayiy0, xjx, a[offseta + row + (col + 0) * lda]);
2343+
a[offseta + row + (col + 1) * lda] = Math.fma(alphayiy1, xjx, a[offseta + row + (col + 1) * lda]);
2344+
a[offseta + row + (col + 2) * lda] = Math.fma(alphayiy2, xjx, a[offseta + row + (col + 2) * lda]);
2345+
a[offseta + row + (col + 3) * lda] = Math.fma(alphayiy3, xjx, a[offseta + row + (col + 3) * lda]);
2346+
}
2347+
}
2348+
for (; col < n; col += 1, iy += incy) {
2349+
double alphayiy = alpha * y[offsety + iy];
2350+
int row = 0, jx = incx < 0 ? (n - 1) * -incx : 0;
2351+
for (; row < m; row += 1, jx += incx) {
2352+
a[offseta + row + col * lda] = Math.fma(alphayiy, x[offsetx + jx], a[offseta + row + col * lda]);
2353+
}
2354+
}
2355+
}
2356+
2357+
protected void sgerK(int m, int n, float alpha, float[] x, int offsetx, int incx, float[] y, int offsety, int incy, float[] a, int offseta, int lda) {
2358+
int col = 0, iy = incy < 0 ? (n - 1) * -incy : 0;
2359+
for (; col < loopBound(n, 4); col += 4, iy += incy * 4) {
2360+
float alphayiy0 = alpha * y[offsety + iy + incy * 0];
2361+
float alphayiy1 = alpha * y[offsety + iy + incy * 1];
2362+
float alphayiy2 = alpha * y[offsety + iy + incy * 2];
2363+
float alphayiy3 = alpha * y[offsety + iy + incy * 3];
2364+
int row = 0, jx = incx < 0 ? (n - 1) * -incx : 0;
2365+
for (; row < m; row += 1, jx += incx) {
2366+
float xjx = x[offsetx + jx];
2367+
a[offseta + row + (col + 0) * lda] = Math.fma(alphayiy0, xjx, a[offseta + row + (col + 0) * lda]);
2368+
a[offseta + row + (col + 1) * lda] = Math.fma(alphayiy1, xjx, a[offseta + row + (col + 1) * lda]);
2369+
a[offseta + row + (col + 2) * lda] = Math.fma(alphayiy2, xjx, a[offseta + row + (col + 2) * lda]);
2370+
a[offseta + row + (col + 3) * lda] = Math.fma(alphayiy3, xjx, a[offseta + row + (col + 3) * lda]);
2371+
}
2372+
}
2373+
for (; col < n; col += 1, iy += incy) {
2374+
float alphayiy = alpha * y[offsety + iy];
2375+
int row = 0, jx = incx < 0 ? (n - 1) * -incx : 0;
2376+
for (; row < m; row += 1, jx += incx) {
2377+
a[offseta + row + col * lda] = Math.fma(alphayiy, x[offsetx + jx], a[offseta + row + col * lda]);
2378+
}
2379+
}
2380+
}
2381+
21642382
protected double dnrm2K(int n, double[] x, int offsetx, int incx) {
21652383
int ix = 0;
21662384
double sum0 = 0.0;

0 commit comments

Comments
 (0)