
# Model
model:
  _target_: benchmarks._models.sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: benchmarks._models.sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: benchmarks._models.sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
      # NOTE(review): original lines 24-27 are not visible in this excerpt
      fpn_interp_model: nearest

  memory_attention:
    _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: benchmarks._models.sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
      # NOTE(review): original lines 45-48 are not visible in this excerpt
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: benchmarks._models.sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: true
    # NOTE(review): original lines 56-60 are not visible in this excerpt
    num_layers: 4

  memory_encoder:
    _target_: benchmarks._models.sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: benchmarks._models.sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: benchmarks._models.sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: benchmarks._models.sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: benchmarks._models.sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3