1
1
import random
2
2
from collections import defaultdict
3
3
from math import ceil
4
- from typing import TYPE_CHECKING , Optional , Union
4
+ from typing import TYPE_CHECKING , Any , Optional , Union
5
5
6
+ import httpx
6
7
from pydantic import BaseModel , computed_field
7
8
8
9
if TYPE_CHECKING :
9
10
from guidellm .benchmark .benchmark import GenerativeBenchmark
10
11
12
+ from guidellm .dataset .file import FileDatasetCreator
13
+ from guidellm .dataset .hf_datasets import HFDatasetsCreator
14
+ from guidellm .dataset .in_memory import InMemoryDatasetCreator
15
+ from guidellm .dataset .synthetic import SyntheticDatasetConfig , SyntheticDatasetCreator
11
16
from guidellm .objects .statistics import DistributionSummary
12
17
13
18
@@ -58,6 +63,41 @@ class Model(BaseModel):
58
63
class Dataset(BaseModel):
    """Presentation model carrying the display name of a benchmark's dataset."""

    name: str

    @classmethod
    def from_data(cls, request_loader: Any):
        """Build a ``Dataset`` whose name is derived from a request loader.

        The first dataset creator that reports support for
        ``request_loader.data`` is used to create the dataset and extract its
        name. When the creator yields no name, a creator-specific fallback is
        used instead.

        :param request_loader: object exposing ``data``, ``data_args``,
            ``processor`` and ``processor_args`` attributes.
        :return: a ``Dataset``; ``name`` is ``""`` when no creator supports
            the data or no name could be determined.
        """
        creators = [
            InMemoryDatasetCreator,
            SyntheticDatasetCreator,
            FileDatasetCreator,
            HFDatasetsCreator,
        ]
        data = request_loader.data
        data_args = request_loader.data_args
        processor = request_loader.processor
        processor_args = request_loader.processor_args
        # Loop-invariant seed passed to handle_create; only the resulting
        # dataset's name is used here.
        random_seed = 42

        dataset_name = None
        for creator in creators:
            if not creator.is_supported(data, None):
                continue

            dataset = creator.handle_create(
                data, data_args, processor, processor_args, random_seed
            )
            dataset_name = creator.extract_dataset_name(dataset)

            if not dataset_name:
                # ``creator`` is a class object, so it must be compared by
                # identity/equality; ``isinstance(creator, SomeCreator)`` is
                # always False for a class and would skip the fallback.
                if creator == SyntheticDatasetCreator:
                    dataset_name = SyntheticDatasetConfig.parse_str(data).source
                elif creator in (FileDatasetCreator, HFDatasetsCreator):
                    # NOTE(review): ``data`` is presumably a path or dataset
                    # id here; str() keeps the ``name: str`` field valid when
                    # it is not already a string.
                    dataset_name = str(data)
                elif creator == InMemoryDatasetCreator:
                    dataset_name = "In-memory"

            # Only the first supporting creator is consulted.
            break

        return cls(name=dataset_name or "")
100
+
61
101
62
102
class RunInfo (BaseModel ):
63
103
model : Model
@@ -71,11 +111,14 @@ def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]):
71
111
timestamp = max (
72
112
bm .run_stats .start_time for bm in benchmarks if bm .start_time is not None
73
113
)
114
+ response = httpx .get (f"https://huggingface.co/api/models/{ model } " )
115
+ model_json = response .json ()
116
+
74
117
return cls (
75
- model = Model (name = model , size = 0 ),
118
+ model = Model (name = model , size = model_json . get ( "usedStorage" , 0 ) ),
76
119
task = "N/A" ,
77
120
timestamp = timestamp ,
78
- dataset = Dataset ( name = "N/A" ),
121
+ dataset = Dataset . from_data ( benchmarks [ 0 ]. request_loader ),
79
122
)
80
123
81
124
0 commit comments