@@ -480,6 +480,7 @@ struct llama_file_loader {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
@@ -554,6 +555,7 @@ struct llama_file_saver {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_0C:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
@@ -842,6 +844,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_ALL_F32: return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: return "mostly Q4_0C";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
             return "mostly Q4_1, some F16";
@@ -1579,6 +1582,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     ggml_type quantized_type;
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_0C: quantized_type = GGML_TYPE_Q4_0C; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
@@ -1658,15 +1662,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = work.addr;
             std::vector<int64_t> hist_cur(1 << 4, 0);

-            int chunk_size = 32 * 512;
+            int row_size = tensor.ne.at(0);
+            int chunk_size = ceil(32 * 512 * 1.0 / row_size) * row_size;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
+                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, row_size, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size, row_size]() {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
@@ -1682,7 +1687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
                         if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
+                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, row_size, local_hist.data());
                     }
                 };
                 if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
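Side note on the chunk-size change above: the patch rounds the ~32*512-element work chunk up to a whole number of rows so each chunk covers complete rows (which the new row_size argument of ggml_quantize_chunk relies on). The standalone sketch below shows the same rounding in pure integer arithmetic; round_chunk_to_rows is a hypothetical helper name, not part of this patch, and row_size stands in for tensor.ne.at(0).

// Standalone sketch, not part of the patch: round the target chunk size up
// to a whole number of rows, mirroring the ceil() expression in the hunk above.
// round_chunk_to_rows is a hypothetical name; row_size is assumed > 0.
#include <cstdio>

static int round_chunk_to_rows(int target_elems, int row_size) {
    int rows = (target_elems + row_size - 1) / row_size; // integer ceiling of target_elems / row_size
    return rows * row_size;                              // convert back to a count of elements
}

int main() {
    printf("%d\n", round_chunk_to_rows(32 * 512, 4096)); // 16384: already exactly 4 rows
    printf("%d\n", round_chunk_to_rows(32 * 512, 5120)); // 20480: 3.2 rows rounded up to 4
    return 0;
}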