@@ -2161,6 +2161,224 @@ protected void sgemmTT(int m, int n, int k, float alpha, float[] a, int offseta,
21612161 }
21622162 }
21632163
2164+ protected void dgemvN (int m , int n , double alpha , double [] a , int offseta , int lda , double [] x , int offsetx , int incx , double beta , double [] y , int offsety , int incy ) {
2165+ if (beta != 1.0 ) {
2166+ int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ;
2167+ for (; row < m ; row += 1 , iy += incy ) {
2168+ if (beta != 0.0 ) {
2169+ y [offsety + iy ] = beta * y [offsety + iy ];
2170+ } else {
2171+ y [offsety + iy ] = 0.0 ;
2172+ }
2173+ }
2174+ }
2175+ int col = 0 , ix = incx < 0 ? (n - 1 ) * -incx : 0 ;
2176+ for (; col < loopBound (n , 4 ); col += 4 , ix += incx * 4 ) {
2177+ int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ;
2178+ double alphax0 = alpha * x [offsetx + ix + incx * 0 ];
2179+ double alphax1 = alpha * x [offsetx + ix + incx * 1 ];
2180+ double alphax2 = alpha * x [offsetx + ix + incx * 2 ];
2181+ double alphax3 = alpha * x [offsetx + ix + incx * 3 ];
2182+ for (; row < m ; row += 1 , iy += incy ) {
2183+ y [offsety + iy ] = Math .fma (alphax0 , a [offseta + row + (col + 0 ) * lda ],
2184+ Math .fma (alphax1 , a [offseta + row + (col + 1 ) * lda ],
2185+ Math .fma (alphax2 , a [offseta + row + (col + 2 ) * lda ],
2186+ Math .fma (alphax3 , a [offseta + row + (col + 3 ) * lda ], y [offsety + iy ]))));
2187+ }
2188+ }
2189+ for (; col < n ; col += 1 , ix += incx ) {
2190+ int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ;
2191+ double alphax = alpha * x [offsetx + ix ];
2192+ for (; row < m ; row += 1 , iy += incy ) {
2193+ y [offsety + iy ] = Math .fma (alphax , a [offseta + row + col * lda ], y [offsety + iy ]);
2194+ }
2195+ }
2196+ }
2197+
2198+ protected void dgemvT (int m , int n , double alpha , double [] a , int offseta , int lda , double [] x , int offsetx , int incx , double beta , double [] y , int offsety , int incy ) {
2199+ int col = 0 , iy = incy < 0 ? (n - 1 ) * -incy : 0 ;
2200+ for (; col < loopBound (n , 4 ); col += 4 , iy += incy * 4 ) {
2201+ int row = 0 , ix = incx < 0 ? (m - 1 ) * -incx : 0 ;
2202+ double sum0 = 0.0 ;
2203+ double sum1 = 0.0 ;
2204+ double sum2 = 0.0 ;
2205+ double sum3 = 0.0 ;
2206+ for (; row < m ; row += 1 , ix += incx ) {
2207+ double xix = x [offsetx + ix ];
2208+ sum0 = Math .fma (xix , a [offseta + row + (col + 0 ) * lda ], sum0 );
2209+ sum1 = Math .fma (xix , a [offseta + row + (col + 1 ) * lda ], sum1 );
2210+ sum2 = Math .fma (xix , a [offseta + row + (col + 2 ) * lda ], sum2 );
2211+ sum3 = Math .fma (xix , a [offseta + row + (col + 3 ) * lda ], sum3 );
2212+ }
2213+ if (beta != 0.0 ) {
2214+ y [offsety + iy + incy * 0 ] = alpha * sum0 + beta * y [offsety + iy + incy * 0 ];
2215+ y [offsety + iy + incy * 1 ] = alpha * sum1 + beta * y [offsety + iy + incy * 1 ];
2216+ y [offsety + iy + incy * 2 ] = alpha * sum2 + beta * y [offsety + iy + incy * 2 ];
2217+ y [offsety + iy + incy * 3 ] = alpha * sum3 + beta * y [offsety + iy + incy * 3 ];
2218+ } else {
2219+ y [offsety + iy + incy * 0 ] = alpha * sum0 ;
2220+ y [offsety + iy + incy * 1 ] = alpha * sum1 ;
2221+ y [offsety + iy + incy * 2 ] = alpha * sum2 ;
2222+ y [offsety + iy + incy * 3 ] = alpha * sum3 ;
2223+ }
2224+ }
2225+ for (; col < n ; col += 1 , iy += incy ) {
2226+ int row = 0 , ix = incx < 0 ? (m - 1 ) * -incx : 0 ;
2227+ double sum = 0.0 ;
2228+ for (; row < m ; row += 1 , ix += incx ) {
2229+ sum = Math .fma (x [offsetx + ix ], a [offseta + row + col * lda ], sum );
2230+ }
2231+ if (beta != 0.0 ) {
2232+ y [offsety + iy ] = alpha * sum + beta * y [offsety + iy ];
2233+ } else {
2234+ y [offsety + iy ] = alpha * sum ;
2235+ }
2236+ }
2237+ }
2238+
2239+ protected void sgemvN (int m , int n , float alpha , float [] a , int offseta , int lda , float [] x , int offsetx , int incx , float beta , float [] y , int offsety , int incy ) {
2240+ // y = beta * y
2241+ for (int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ; row < m ; row += 1 , iy += incy ) {
2242+ if (beta != 0.0f ) {
2243+ y [offsety + iy ] = beta * y [offsety + iy ];
2244+ } else {
2245+ y [offsety + iy ] = 0.0f ;
2246+ }
2247+ }
2248+ // y += alpha * A * x
2249+ int col = 0 , ix = incx < 0 ? (n - 1 ) * -incx : 0 ;
2250+ for (; col < loopBound (n , 8 ); col += 8 , ix += incx * 8 ) {
2251+ float alphax0 = alpha * x [offsetx + ix + incx * 0 ];
2252+ float alphax1 = alpha * x [offsetx + ix + incx * 1 ];
2253+ float alphax2 = alpha * x [offsetx + ix + incx * 2 ];
2254+ float alphax3 = alpha * x [offsetx + ix + incx * 3 ];
2255+ float alphax4 = alpha * x [offsetx + ix + incx * 4 ];
2256+ float alphax5 = alpha * x [offsetx + ix + incx * 5 ];
2257+ float alphax6 = alpha * x [offsetx + ix + incx * 6 ];
2258+ float alphax7 = alpha * x [offsetx + ix + incx * 7 ];
2259+ for (int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ; row < m ; row += 1 , iy += incy ) {
2260+ y [offsety + iy ] = Math .fma (alphax0 , a [offseta + row + (col + 0 ) * lda ],
2261+ Math .fma (alphax1 , a [offseta + row + (col + 1 ) * lda ],
2262+ Math .fma (alphax2 , a [offseta + row + (col + 2 ) * lda ],
2263+ Math .fma (alphax3 , a [offseta + row + (col + 3 ) * lda ],
2264+ Math .fma (alphax4 , a [offseta + row + (col + 4 ) * lda ],
2265+ Math .fma (alphax5 , a [offseta + row + (col + 5 ) * lda ],
2266+ Math .fma (alphax6 , a [offseta + row + (col + 6 ) * lda ],
2267+ Math .fma (alphax7 , a [offseta + row + (col + 7 ) * lda ], y [offsety + iy ]))))))));
2268+ }
2269+ }
2270+ for (; col < n ; col += 1 , ix += incx ) {
2271+ float alphax = alpha * x [offsetx + ix ];
2272+ for (int row = 0 , iy = incy < 0 ? (m - 1 ) * -incy : 0 ; row < m ; row += 1 , iy += incy ) {
2273+ y [offsety + iy ] = Math .fma (alphax , a [offseta + row + col * lda ], y [offsety + iy ]);
2274+ }
2275+ }
2276+ }
2277+
2278+ protected void sgemvT (int m , int n , float alpha , float [] a , int offseta , int lda , float [] x , int offsetx , int incx , float beta , float [] y , int offsety , int incy ) {
2279+ int col = 0 , iy = incy < 0 ? (n - 1 ) * -incy : 0 ;
2280+ for (; col < loopBound (n , 8 ); col += 8 , iy += incy * 8 ) {
2281+ float sum0 = 0.0f ;
2282+ float sum1 = 0.0f ;
2283+ float sum2 = 0.0f ;
2284+ float sum3 = 0.0f ;
2285+ float sum4 = 0.0f ;
2286+ float sum5 = 0.0f ;
2287+ float sum6 = 0.0f ;
2288+ float sum7 = 0.0f ;
2289+ for (int row = 0 , ix = incx < 0 ? (m - 1 ) * -incx : 0 ; row < m ; row += 1 , ix += incx ) {
2290+ sum0 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 0 ) * lda ], sum0 );
2291+ sum1 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 1 ) * lda ], sum1 );
2292+ sum2 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 2 ) * lda ], sum2 );
2293+ sum3 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 3 ) * lda ], sum3 );
2294+ sum4 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 4 ) * lda ], sum4 );
2295+ sum5 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 5 ) * lda ], sum5 );
2296+ sum6 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 6 ) * lda ], sum6 );
2297+ sum7 = Math .fma (x [offsetx + ix ], a [offseta + row + (col + 7 ) * lda ], sum7 );
2298+ }
2299+ if (beta != 0.0f ) {
2300+ y [offsety + iy + incy * 0 ] = alpha * sum0 + beta * y [offsety + iy + incy * 0 ];
2301+ y [offsety + iy + incy * 1 ] = alpha * sum1 + beta * y [offsety + iy + incy * 1 ];
2302+ y [offsety + iy + incy * 2 ] = alpha * sum2 + beta * y [offsety + iy + incy * 2 ];
2303+ y [offsety + iy + incy * 3 ] = alpha * sum3 + beta * y [offsety + iy + incy * 3 ];
2304+ y [offsety + iy + incy * 4 ] = alpha * sum4 + beta * y [offsety + iy + incy * 4 ];
2305+ y [offsety + iy + incy * 5 ] = alpha * sum5 + beta * y [offsety + iy + incy * 5 ];
2306+ y [offsety + iy + incy * 6 ] = alpha * sum6 + beta * y [offsety + iy + incy * 6 ];
2307+ y [offsety + iy + incy * 7 ] = alpha * sum7 + beta * y [offsety + iy + incy * 7 ];
2308+ } else {
2309+ y [offsety + iy + incy * 0 ] = alpha * sum0 ;
2310+ y [offsety + iy + incy * 1 ] = alpha * sum1 ;
2311+ y [offsety + iy + incy * 2 ] = alpha * sum2 ;
2312+ y [offsety + iy + incy * 3 ] = alpha * sum3 ;
2313+ y [offsety + iy + incy * 4 ] = alpha * sum4 ;
2314+ y [offsety + iy + incy * 5 ] = alpha * sum5 ;
2315+ y [offsety + iy + incy * 6 ] = alpha * sum6 ;
2316+ y [offsety + iy + incy * 7 ] = alpha * sum7 ;
2317+ }
2318+ }
2319+ for (; col < n ; col += 1 , iy += incy ) {
2320+ float sum = 0.0f ;
2321+ for (int row = 0 , ix = incx < 0 ? (m - 1 ) * -incx : 0 ; row < m ; row += 1 , ix += incx ) {
2322+ sum = Math .fma (x [offsetx + ix ], a [offseta + row + col * lda ], sum );
2323+ }
2324+ if (beta != 0.0f ) {
2325+ y [offsety + iy ] = alpha * sum + beta * y [offsety + iy ];
2326+ } else {
2327+ y [offsety + iy ] = alpha * sum ;
2328+ }
2329+ }
2330+ }
2331+
2332+ protected void dgerK (int m , int n , double alpha , double [] x , int offsetx , int incx , double [] y , int offsety , int incy , double [] a , int offseta , int lda ) {
2333+ int col = 0 , iy = incy < 0 ? (n - 1 ) * -incy : 0 ;
2334+ for (; col < loopBound (n , 4 ); col += 4 , iy += incy * 4 ) {
2335+ double alphayiy0 = alpha * y [offsety + iy + incy * 0 ];
2336+ double alphayiy1 = alpha * y [offsety + iy + incy * 1 ];
2337+ double alphayiy2 = alpha * y [offsety + iy + incy * 2 ];
2338+ double alphayiy3 = alpha * y [offsety + iy + incy * 3 ];
2339+ int row = 0 , jx = incx < 0 ? (n - 1 ) * -incx : 0 ;
2340+ for (; row < m ; row += 1 , jx += incx ) {
2341+ double xjx = x [offsetx + jx ];
2342+ a [offseta + row + (col + 0 ) * lda ] = Math .fma (alphayiy0 , xjx , a [offseta + row + (col + 0 ) * lda ]);
2343+ a [offseta + row + (col + 1 ) * lda ] = Math .fma (alphayiy1 , xjx , a [offseta + row + (col + 1 ) * lda ]);
2344+ a [offseta + row + (col + 2 ) * lda ] = Math .fma (alphayiy2 , xjx , a [offseta + row + (col + 2 ) * lda ]);
2345+ a [offseta + row + (col + 3 ) * lda ] = Math .fma (alphayiy3 , xjx , a [offseta + row + (col + 3 ) * lda ]);
2346+ }
2347+ }
2348+ for (; col < n ; col += 1 , iy += incy ) {
2349+ double alphayiy = alpha * y [offsety + iy ];
2350+ int row = 0 , jx = incx < 0 ? (n - 1 ) * -incx : 0 ;
2351+ for (; row < m ; row += 1 , jx += incx ) {
2352+ a [offseta + row + col * lda ] = Math .fma (alphayiy , x [offsetx + jx ], a [offseta + row + col * lda ]);
2353+ }
2354+ }
2355+ }
2356+
2357+ protected void sgerK (int m , int n , float alpha , float [] x , int offsetx , int incx , float [] y , int offsety , int incy , float [] a , int offseta , int lda ) {
2358+ int col = 0 , iy = incy < 0 ? (n - 1 ) * -incy : 0 ;
2359+ for (; col < loopBound (n , 4 ); col += 4 , iy += incy * 4 ) {
2360+ float alphayiy0 = alpha * y [offsety + iy + incy * 0 ];
2361+ float alphayiy1 = alpha * y [offsety + iy + incy * 1 ];
2362+ float alphayiy2 = alpha * y [offsety + iy + incy * 2 ];
2363+ float alphayiy3 = alpha * y [offsety + iy + incy * 3 ];
2364+ int row = 0 , jx = incx < 0 ? (n - 1 ) * -incx : 0 ;
2365+ for (; row < m ; row += 1 , jx += incx ) {
2366+ float xjx = x [offsetx + jx ];
2367+ a [offseta + row + (col + 0 ) * lda ] = Math .fma (alphayiy0 , xjx , a [offseta + row + (col + 0 ) * lda ]);
2368+ a [offseta + row + (col + 1 ) * lda ] = Math .fma (alphayiy1 , xjx , a [offseta + row + (col + 1 ) * lda ]);
2369+ a [offseta + row + (col + 2 ) * lda ] = Math .fma (alphayiy2 , xjx , a [offseta + row + (col + 2 ) * lda ]);
2370+ a [offseta + row + (col + 3 ) * lda ] = Math .fma (alphayiy3 , xjx , a [offseta + row + (col + 3 ) * lda ]);
2371+ }
2372+ }
2373+ for (; col < n ; col += 1 , iy += incy ) {
2374+ float alphayiy = alpha * y [offsety + iy ];
2375+ int row = 0 , jx = incx < 0 ? (n - 1 ) * -incx : 0 ;
2376+ for (; row < m ; row += 1 , jx += incx ) {
2377+ a [offseta + row + col * lda ] = Math .fma (alphayiy , x [offsetx + jx ], a [offseta + row + col * lda ]);
2378+ }
2379+ }
2380+ }
2381+
21642382 protected double dnrm2K (int n , double [] x , int offsetx , int incx ) {
21652383 int ix = 0 ;
21662384 double sum0 = 0.0 ;
0 commit comments