@@ -4,9 +4,10 @@
 
 #include "grammar-parser.h"
 
+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
 // sampler types
 enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {
 
 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev            = 64;       // number of previous tokens to remember
-    int32_t     n_probs           = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep          = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k             = 40;       // <= 0 to use vocab size
-    float       top_p             = 0.95f;    // 1.0 = disabled
-    float       min_p             = 0.05f;    // 0.0 = disabled
-    float       tfs_z             = 1.00f;    // 1.0 = disabled
-    float       typical_p         = 1.00f;    // 1.0 = disabled
-    float       temp              = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range    = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n    = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat    = 1.00f;    // 1.0 = disabled
-    float       penalty_freq      = 0.00f;    // 0.0 = disabled
-    float       penalty_present   = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat          = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau      = 5.00f;    // target entropy
-    float       mirostat_eta      = 0.10f;    // learning rate
-    bool        penalize_nl       = false;    // consider newlines as a repeatable token
+    int32_t     n_prev            = 64;                 // number of previous tokens to remember
+    int32_t     n_probs           = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep          = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k             = 40;                 // <= 0 to use vocab size
+    float       top_p             = 0.95f;              // 1.0 = disabled
+    float       min_p             = 0.05f;              // 0.0 = disabled
+    float       tfs_z             = 1.00f;              // 1.0 = disabled
+    float       typical_p         = 1.00f;              // 1.0 = disabled
+    float       temp              = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range    = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n    = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat    = 1.00f;              // 1.0 = disabled
+    float       penalty_freq     = 0.00f;               // 0.0 = disabled
+    float       penalty_present   = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat          = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau      = 5.00f;              // target entropy
+    float       mirostat_eta      = 0.10f;              // learning rate
+    bool        penalize_nl       = false;              // consider newlines as a repeatable token
+    uint32_t    seed              = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
+
+    std::mt19937 rng;
 };
 
 #include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);
 
+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
 
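The header only declares `llama_sampling_set_rng_seed`; a plausible matching definition, sketched under the assumption that passing `LLAMA_DEFAULT_SEED` requests a non-deterministic seed (the body below is illustrative, not necessarily the PR's sampling.cpp):

```cpp
#include <random>

// Illustrative sketch of the setter's body: fall back to a random seed
// when the caller passes the default sentinel, then re-seed the
// context-owned generator.
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
    if (seed == LLAMA_DEFAULT_SEED) {
        seed = std::random_device{}(); // assumed convention for "pick one for me"
    }
    ctx->rng.seed(seed);
}
```

Callers can then pin generation to a fixed stream, e.g. set `params.seed = 42;` before `llama_sampling_init(params)`, or call `llama_sampling_set_rng_seed(ctx, 42);` on an existing context to replay a sampling run.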