Skip to content
Closed
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,9 @@ endif()

add_library(ggml OBJECT
ggml.c
ggml.h)
ggml.h
ggml_extra.h
ggml_extra.cpp)

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
Expand Down
27 changes: 15 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

ggml_extra.o: ggml_extra.cpp ggml_extra.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

clean:
rm -vf *.o main quantize quantize-stats perplexity embedding

main: examples/main/main.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo

quantize: examples/quantize/quantize.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

libllama.so: llama.o ggml.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
libllama.so: llama.o ggml.o ggml_extra.o
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
#
# Tests
#
Expand Down
21 changes: 19 additions & 2 deletions examples/quantize-stats/quantize-stats.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ggml.h"
#include "llama.h"
#include "llama_internal.h"
#include "ggml_extra.h"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -29,7 +30,7 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types;
};

const int64_t SCRATCH_ELEMENTS = 32*32;
const int64_t SCRATCH_ELEMENTS = 32*32*256; // So we use multi-threading in a meaningful way in the new quantization
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;

Expand Down Expand Up @@ -184,6 +185,7 @@ int main(int argc, char ** argv) {
// read command line

bool invalid_param = false;
bool checkNewQuantization = false;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
Expand Down Expand Up @@ -232,6 +234,8 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
} else if (arg == "-nq" || arg == "--new-quantization") {
checkNewQuantization = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
Expand Down Expand Up @@ -302,11 +306,24 @@ int main(int argc, char ** argv) {
std::vector<float> output_scratch(SCRATCH_ELEMENTS);

// loop through the quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
for (int i = 0; i < 1; i++) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
if (i < 2 && checkNewQuantization) {
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
//if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K;
qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K;
}
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
if (params.verbose) {
printf("testing %s ...\n", type_strs[i]);
Expand Down
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
fprintf(stderr, " type = 3 - q4_1\n");
fprintf(stderr, " type = 4 - new q4_0\n");
fprintf(stderr, " type = 5 - new q4_1\n");
return 1;
}

Expand Down
23 changes: 21 additions & 2 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define _GNU_SOURCE

#include "ggml.h"
#include "ggml_extra.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
Expand Down Expand Up @@ -502,6 +503,13 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");

// Fast round-to-nearest for a float, avoiding the (slow) round()/roundf()
// library calls. Adding 1.5 * 2^23 (= 12582912) forces the FPU, in its
// default round-to-nearest-even mode, to round `fval` to an integer held in
// the low mantissa bits of the sum; masking out the mantissa and subtracting
// the 2^22 offset recovers the signed result.
// NOTE: ties round to even (nearestInt(2.5f) == 2), unlike roundf().
// Valid only while fval + 12582912.f stays in [2^23, 2^24), i.e. for
// fval in [-4194304, 4194303] — both bounds are asserted below.
inline int nearestInt(float fval) {
    assert(fval <=  4194303.f);
    assert(fval >= -4194304.f); // lower bound was previously unchecked; the trick silently fails below -2^22
    float val = fval + 12582912.f;
    int i; memcpy(&i, &val, sizeof(int)); // bit-level reinterpretation via memcpy (no aliasing UB)
    return (i & 0x007fffff) - 0x00400000; // mantissa minus the 2^22 offset
}

// reference implementation for deterministic creation of model files
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
assert(k % QK == 0);
Expand All @@ -526,8 +534,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
const float v0 = x[i*QK + l + 0]*id;
const float v1 = x[i*QK + l + 1]*id;

// On x86_64 and x86, round is amazingly slow.
// Here it is best to just use this:
//const uint8_t vi0 = (uint8_t)(v0 + 8.5f);
//const uint8_t vi1 = (uint8_t)(v1 + 8.5f);
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
// This is marginally slower (but still much faster than round())
//const uint8_t vi0 = nearestInt(v0) + 8;
//const uint8_t vi1 = nearestInt(v1) + 8;

assert(vi0 < 16);
assert(vi1 < 16);
Expand Down Expand Up @@ -818,6 +833,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
const float v0 = (x[i*QK + l + 0] - min)*id;
const float v1 = (x[i*QK + l + 1] - min)*id;

// For some reason round() is amazingly slow on X86_64 and x86
// Using this instead reduces the difference between AVX2 and scalar to less than ~15%
//const uint8_t vi0 = nearestInt(v0);
//const uint8_t vi1 = nearestInt(v1);
const uint8_t vi0 = roundf(v0);
const uint8_t vi1 = roundf(v1);

Expand Down Expand Up @@ -2569,7 +2588,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
1,
};

static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");

static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
sizeof(block_q4_0),
Expand All @@ -2582,7 +2601,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
};

// don't forget to update the array above when adding new types
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");

static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
Expand Down
Loading