@@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy data to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
-            CL_CHECK(clFinish(queue));
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                }
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                 clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                 ne01, ne11, ne10,
-                                                                 alpha,
-                                                                 d_X, x_offset, ne00,
-                                                                 d_Y, 0, ne10,
-                                                                 beta,
-                                                                 d_D, 0, ne01,
-                                                                 &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // copy src1 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                         clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                         ne01, ne11, ne10,
+                                                                         alpha,
+                                                                         d_X, x_offset, ne00,
+                                                                         d_Y, 0, ne10,
+                                                                         beta,
+                                                                         d_D, 0, ne01,
+                                                                         &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                }
+            }
         }
     }
 
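For reference, the restructured f32 path walks src0's batch dimensions (i03, i02) on the outside and the broadcast src1 slices on the inside, with i13 in [i03*r3, i03*r3 + r3) and i12 in [i02*r2, i02*r2 + r2); this is the inverse of the old i03 = i13 / r3, i02 = i12 / r2 mapping. Below is a minimal standalone sketch (not part of the patch; the batch sizes and counters are illustrative) that checks the two index mappings agree and counts the resulting host-to-device copies and GEMM calls:

```cpp
// Standalone sketch (not ggml code): verify that the new loop order visits the
// same (src0 slice, src1 slice) pairs as the old i03 = i13/r3, i02 = i12/r2
// derivation, and count uploads/GEMMs. All sizes here are illustrative.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 3;   // src0 batch dims
    const int64_t r2   = 4, r3   = 2;   // broadcast ratios: r2 = ne12/ne02, r3 = ne13/ne03
    const int64_t ne12 = ne02 * r2;
    const int64_t ne13 = ne03 * r3;

    int64_t src0_copies = 0, gemms = 0;

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                src0_copies++;  // one h2d copy of src0[i03][i02] per (i13, i02)
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    assert(i03 == i13 / r3);   // old derivation of the src0 indices
                    assert(i02 == i12 / r2);
                    gemms++;                   // one GEMM per (i13, i12), as before
                }
            }
        }
    }

    assert(gemms == ne12 * ne13);
    printf("src0 copies: %lld, gemms: %lld\n", (long long) src0_copies, (long long) gemms);
    return 0;
}
```

With this ordering the src0 slice for a given (i13, i02) is uploaded once before the inner i12 loop; the TODO in the patch notes that the copy could be hoisted to the i03 level so it is not re-uploaded r3 times when r3 > 1.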
@@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
     size_t x_offset = 0;
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
 
-            // copy src0 to device
-            if (src0->backend == GGML_BACKEND_GPU) {
-                x_offset = (i03 * ne02 + i02) * x_ne;
-            } else if (i02 != pi02 || i03 != pi03) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
-                pi02 = i02;
-                pi03 = i03;
-            }
-
-            // convert src1 to fp16
-            // TODO: use multiple threads
-            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
-            if (src1_cont_rows) {
-                if (src1_cont_cols) {
-                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                if (src0->backend == GGML_BACKEND_GPU) {
+                    x_offset = (i03 * ne02 + i02) * x_ne;
+                } else {
+                    // copy src0 to device
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 }
-                else {
-                    for (int64_t i11 = 0; i11 < ne11; i11++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    // convert src1 to fp16
+                    // TODO: use multiple threads
+                    char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
+                    if (src1_cont_rows) {
+                        if (src1_cont_cols) {
+                            ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                        }
+                        else {
+                            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                                ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
+                            }
+                        }
                     }
-                }
-            }
-            else {
-                for (int64_t i11 = 0; i11 < ne11; i11++) {
-                    for (int64_t i10 = 0; i10 < ne10; i10++) {
-                        // very slow due to no inlining
-                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                    else {
+                        for (int64_t i11 = 0; i11 < ne11; i11++) {
+                            for (int64_t i10 = 0; i10 < ne10; i10++) {
+                                // very slow due to no inlining
+                                tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
+                            }
+                        }
                     }
-                }
-            }
-
-            // copy src1 to device
-            CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
-
-            CL_CHECK(clFinish(queue));
 
-            // compute
-            cl_event ev_sgemm;
-            clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
-                                                                clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                ne01, ne11, ne10,
-                                                                alpha,
-                                                                d_X, x_offset, ne00,
-                                                                d_Y, 0, ne10,
-                                                                beta,
-                                                                d_D, 0, ne01,
-                                                                &queue, &ev_sgemm);
-
-            if (status != clblast::StatusCode::kSuccess) {
-                GGML_ASSERT(false);
-            }
+                    // copy src1 to device
+                    CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
+
+                    CL_CHECK(clFinish(queue));
+
+                    // compute
+                    cl_event ev_sgemm;
+                    clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
+                                                                        clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                        ne01, ne11, ne10,
+                                                                        alpha,
+                                                                        d_X, x_offset, ne00,
+                                                                        d_Y, 0, ne10,
+                                                                        beta,
+                                                                        d_D, 0, ne01,
+                                                                        &queue, &ev_sgemm);
+
+                    if (status != clblast::StatusCode::kSuccess) {
+                        GGML_ASSERT(false);
+                    }
 
-            // copy dst to host, then convert to float
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                    // copy dst to host, then convert to float
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
-            ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                }
+            }
         }
     }
 
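In the f16 path the work inside the new i12 loop is unchanged: each src1 slice is converted to fp16 into the tmp staging buffer on the host (bulk, per row, or per element depending on the nb10/nb11 strides), uploaded, multiplied, and the fp16 result is read back and widened to fp32. Here is a small standalone sketch of that stride-based staging pattern, using plain float instead of ggml_fp16_t so it stays self-contained; the helper name and the contiguity checks are illustrative, not ggml's:

```cpp
// Standalone sketch (not ggml code): stage a possibly strided ne10 x ne11 plane
// into a contiguous buffer, picking the bulk, per-row, or per-element path the
// way the patch does for its fp16 conversion. nb10/nb11 are byte strides.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static void stage_rows(const char * src, float * tmp,
                       int64_t ne10, int64_t ne11,
                       size_t nb10, size_t nb11) {
    const bool cont_rows = nb10 == sizeof(float);                // elements packed within a row
    const bool cont_cols = nb11 == (size_t) ne10*sizeof(float);  // rows packed back to back

    if (cont_rows && cont_cols) {
        // whole plane is contiguous: one bulk copy (the real code does one bulk fp16 conversion)
        memcpy(tmp, src, ne10*ne11*sizeof(float));
    } else if (cont_rows) {
        // rows are contiguous but padded: copy row by row
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            memcpy(tmp + i11*ne10, src + i11*nb11, ne10*sizeof(float));
        }
    } else {
        // fully strided: element by element (the slow path flagged in the code)
        for (int64_t i11 = 0; i11 < ne11; i11++) {
            for (int64_t i10 = 0; i10 < ne10; i10++) {
                tmp[i11*ne10 + i10] = *(const float *) (src + i11*nb11 + i10*nb10);
            }
        }
    }
}

int main() {
    const int64_t ne10 = 4, ne11 = 3;
    std::vector<float> plane(ne10*ne11);
    for (size_t i = 0; i < plane.size(); i++) plane[i] = (float) i;

    std::vector<float> tmp(ne10*ne11);
    stage_rows((const char *) plane.data(), tmp.data(),
               ne10, ne11, sizeof(float), ne10*sizeof(float));
    printf("tmp[5] = %g\n", tmp[5]); // 5, since this layout is fully contiguous
    return 0;
}
```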
@@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    int64_t pi02 = -1;
-    int64_t pi03 = -1;
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        int64_t i03 = i13 / r3;
-
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            int64_t i02 = i12 / r2;
-
-            // copy src0 to device if necessary
-            if (src0->backend == GGML_BACKEND_CPU) {
-                if (i02 != pi02 || i03 != pi03) {
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        // TODO: copy and dequantize src0 here when r3>1
+        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                // copy src0 to device if necessary
+                if (src0->backend == GGML_BACKEND_CPU) {
                     events.emplace_back();
                     CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-                    pi02 = i02;
-                    pi03 = i03;
-                }
-            } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->extra;
-            } else {
-                GGML_ASSERT(false);
-            }
-            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
-                // copy src1 to device
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
-
-                // compute
-                const size_t global = ne01 * local;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                const cl_int ncols = ne00;
-                events.emplace_back();
-                CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
-                CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
-                CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
-                CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
-            } else { // general dequantization kernel + CLBlast matrix matrix multiplication
-                // convert src0 to fp32 on device
-                const size_t global = x_ne / global_denom;
-                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
-                CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
-
-                // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-
-                events.emplace_back();
-
-                // wait for conversion
-                CL_CHECK(clFinish(queue));
-
-                // compute
-                clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
-                                                                     clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                                     ne01, ne11, ne10,
-                                                                     alpha,
-                                                                     d_X, 0, ne00,
-                                                                     d_Y, 0, ne10,
-                                                                     beta,
-                                                                     d_D, 0, ne01,
-                                                                     &queue, events.data() + ev_idx++);
-
-                if (status != clblast::StatusCode::kSuccess) {
+                } else if (src0->backend == GGML_BACKEND_GPU) {
+                    d_Q = (cl_mem) src0->extra;
+                } else {
                     GGML_ASSERT(false);
                 }
-            }
 
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
-            for (auto *event : events) {
-                clReleaseEvent(event);
-            }
+                if (!mul_mat_vec) {
+                    // convert src0 to fp32 on device
+                    const size_t global = x_ne / global_denom;
+                    const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
+                    CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
+                    CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                }
 
-            ev_idx = 0;
-            events.clear();
+                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
+                    if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                        // copy src1 to device
+                        events.emplace_back();
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
+
+                        // compute
+                        const size_t global = ne01 * local;
+                        const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
+                        const cl_int ncols = ne00;
+                        events.emplace_back();
+                        CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
+                        CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
+                        CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
+                        CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
+                        CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
+                        CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
+                    } else { // CLBlast matrix matrix multiplication
+                        // copy src1 to device
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+
+                        // wait for conversion
+                        CL_CHECK(clFinish(queue));
+
+                        // compute
+                        events.emplace_back();
+                        clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
+                                                                             clblast::Transpose::kYes, clblast::Transpose::kNo,
+                                                                             ne01, ne11, ne10,
+                                                                             alpha,
+                                                                             d_X, 0, ne00,
+                                                                             d_Y, 0, ne10,
+                                                                             beta,
+                                                                             d_D, 0, ne01,
+                                                                             &queue, events.data() + ev_idx++);
+
+                        if (status != clblast::StatusCode::kSuccess) {
+                            GGML_ASSERT(false);
+                        }
+                    }
+
+                    // copy dst to host
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
+                    for (auto *event : events) {
+                        clReleaseEvent(event);
+                    }
+
+                    ev_idx = 0;
+                    events.clear();
+                }
+            }
         }
     }
 
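The main change in the quantized path is where the work sits in the loop nest: the h2d copy of the quantized slice and, for the CLBlast branch, the to_fp32_cl dequantization now run once per (i03, i13, i02), before the inner i12 loop, so the r2 broadcast src1 slices reuse the same dequantized d_X buffer instead of re-dequantizing per (i13, i12). A standalone sketch of what that saves, counting only kernel launches for the non-mul_mat_vec branch with illustrative sizes:

```cpp
// Standalone sketch (not ggml code): compare how many to_fp32 dequantization
// launches the old and new loop orders issue for the CLBlast (non mul_mat_vec)
// path. Batch sizes and ratios below are illustrative only.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 2;   // src0 batch dims
    const int64_t r2   = 4, r3   = 1;   // broadcast ratios: r2 = ne12/ne02, r3 = ne13/ne03
    const int64_t ne12 = ne02 * r2;
    const int64_t ne13 = ne03 * r3;

    // old order: the general path dequantized src0 once per (i13, i12) iteration
    int64_t old_dequant = 0;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            old_dequant++;
        }
    }

    // new order: src0[i03][i02] is dequantized once before the inner i12 loop,
    // so the r2 broadcast src1 slices reuse the same fp32 copy
    int64_t new_dequant = 0;
    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
            for (int64_t i02 = 0; i02 < ne02; i02++) {
                new_dequant++;
            }
        }
    }

    printf("dequantize launches: old=%lld new=%lld\n",
           (long long) old_dequant, (long long) new_dequant);
    return 0;
}
```

The TODO comment marks the remaining redundancy: when r3 > 1 the same src0 slice is still copied and dequantized once per i13, which could be hoisted to the i03 level.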