Skip to content
Closed
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ endif()

add_library(ggml OBJECT
ggml.c
ggml.h)
ggml.h
ggml_extra.h
ggml_extra.cpp)

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
Expand Down
27 changes: 15 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

ggml_extra.o: ggml_extra.cpp ggml_extra.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

clean:
rm -vf *.o main quantize quantize-stats perplexity embedding

main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

libllama.so: llama.o ggml.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
libllama.so: llama.o ggml.o ggml_extra.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
#
# Tests
#
Expand Down
21 changes: 19 additions & 2 deletions examples/quantize-stats/quantize-stats.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ggml.h"
#include "llama.h"
#include "llama_internal.h"
#include "ggml_extra.h"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -29,7 +30,7 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types;
};

const int64_t SCRATCH_ELEMENTS = 32*32;
const int64_t SCRATCH_ELEMENTS = 32*32*256; // So we use multi-threading in a meaningful way in the new quantization
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;

Expand Down Expand Up @@ -184,6 +185,7 @@ int main(int argc, char ** argv) {
// read command line

bool invalid_param = false;
bool checkNewQuantization = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
Expand Down Expand Up @@ -232,6 +234,8 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
} else if (arg == "-nq" || arg == "--new-quantization") {
checkNewQuantization = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
Expand Down Expand Up @@ -302,11 +306,24 @@ int main(int argc, char ** argv) {
std::vector<float> output_scratch(SCRATCH_ELEMENTS);

// loop through the quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
for (int i = 0; i < 1; i++) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (i < 2 && checkNewQuantization) {
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
//if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K;
qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K;
}
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", type_strs[i]);
Expand Down
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 4 - new q4_0\n");
fprintf(stderr, " type = 5 - new q4_1\n");
return 1;
}

Expand Down
23 changes: 21 additions & 2 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define _GNU_SOURCE

#include "ggml.h"
#include "ggml_extra.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
Expand Down Expand Up @@ -502,6 +503,13 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");

// Fast round-to-nearest for a float, avoiding the (slow) round()/roundf()
// library calls. Adding 1.5 * 2^23 (= 12582912) forces the FPU, in its
// default round-to-nearest-even mode, to round `fval` to an integer held in
// the low mantissa bits of the sum; masking out the mantissa and subtracting
// the 2^22 offset recovers the signed result.
// NOTE: ties round to even (nearestInt(2.5f) == 2), unlike roundf().
// Valid only while fval + 12582912.f stays in [2^23, 2^24), i.e. for
// fval in [-4194304, 4194303] — both bounds are asserted below.
inline int nearestInt(float fval) {
    assert(fval <=  4194303.f);
    assert(fval >= -4194304.f); // lower bound was previously unchecked; the trick silently fails below -2^22
    float val = fval + 12582912.f;
    int i; memcpy(&i, &val, sizeof(int)); // bit-level reinterpretation via memcpy (no aliasing UB)
    return (i & 0x007fffff) - 0x00400000; // mantissa minus the 2^22 offset
}

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
assert(k % QK == 0);
Expand All @@ -526,8 +534,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

// On x86_64 and x86, round is amazingly slow.
// Here it is best to just use this:
//const uint8_t vi0 = (uint8_t)(v0 + 8.5f);
//const uint8_t vi1 = (uint8_t)(v1 + 8.5f);
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
// This is marginally slower (but still much faster than round())
//const uint8_t vi0 = nearestInt(v0) + 8;
//const uint8_t vi1 = nearestInt(v1) + 8;

assert(vi0 < 16);
assert(vi1 < 16);
Expand Down Expand Up @@ -818,6 +833,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
const float v0 = (x[i*QK + l + 0] - min)*id;
const float v1 = (x[i*QK + l + 1] - min)*id;

// For some reason round() is amazingly slow on X86_64 and x86
// Using this instead reduces the difference between AVX2 and scalar to less than ~15%
//const uint8_t vi0 = nearestInt(v0);
//const uint8_t vi1 = nearestInt(v1);
const uint8_t vi0 = roundf(v0);
const uint8_t vi1 = roundf(v1);

Expand Down Expand Up @@ -2569,7 +2588,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
1,
};

static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");

static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
sizeof(block_q4_0),
Expand All @@ -2582,7 +2601,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
};

// don't forget to update the array above when adding new types
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");

static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
Expand Down
Loading