@@ -22,6 +22,7 @@ package main
2222import (
2323 "bufio"
2424 "bytes"
25+ "encoding/binary"
2526 "flag"
2627 "fmt"
2728 "go/format"
@@ -55,6 +56,7 @@ const (
5556)
5657
5758var (
59+ combinedText string
5860 maxChildren int
5961 maxTextOffset int
6062 maxTextLength int
@@ -115,11 +117,10 @@ var (
115117 shaRE = regexp .MustCompile (`"sha":"([^"]+)"` )
116118 dateRE = regexp .MustCompile (`"committer":{[^{]+"date":"([^"]+)"` )
117119
118- comments = flag .Bool ("comments" , false , "generate table.go comments, for debugging" )
119- subset = flag .Bool ("subset" , false , "generate only a subset of the full table, for debugging" )
120- url = flag .String ("url" , defaultURL , "URL of the publicsuffix.org list. If empty, stdin is read instead" )
121- v = flag .Bool ("v" , false , "verbose output (to stderr)" )
122- version = flag .String ("version" , "" , "the effective_tld_names.dat version" )
120+ subset = flag .Bool ("subset" , false , "generate only a subset of the full table, for debugging" )
121+ url = flag .String ("url" , defaultURL , "URL of the publicsuffix.org list. If empty, stdin is read instead" )
122+ v = flag .Bool ("v" , false , "verbose output (to stderr)" )
123+ version = flag .String ("version" , "" , "the effective_tld_names.dat version" )
123124)
124125
125126func main () {
@@ -254,7 +255,33 @@ func main1() error {
254255 }
255256 sort .Strings (labelsList )
256257
257- if err := generate (printReal , & root , "table.go" ); err != nil {
258+ combinedText = combineText (labelsList )
259+ if combinedText == "" {
260+ return fmt .Errorf ("internal error: combineText returned no text" )
261+ }
262+ for _ , label := range labelsList {
263+ offset , length := strings .Index (combinedText , label ), len (label )
264+ if offset < 0 {
265+ return fmt .Errorf ("internal error: could not find %q in text %q" , label , combinedText )
266+ }
267+ maxTextOffset , maxTextLength = max (maxTextOffset , offset ), max (maxTextLength , length )
268+ if offset >= 1 << nodesBitsTextOffset {
269+ return fmt .Errorf ("text offset %d is too large, or nodeBitsTextOffset is too small" , offset )
270+ }
271+ if length >= 1 << nodesBitsTextLength {
272+ return fmt .Errorf ("text length %d is too large, or nodeBitsTextLength is too small" , length )
273+ }
274+ labelEncoding [label ] = uint64 (offset )<< nodesBitsTextLength | uint64 (length )
275+ }
276+
277+ if err := root .walk (assignIndexes ); err != nil {
278+ return err
279+ }
280+
281+ if err := generate (printMetadata , & root , "table.go" ); err != nil {
282+ return err
283+ }
284+ if err := generateBinaryData (& root , combinedText ); err != nil {
258285 return err
259286 }
260287 if err := generate (printTest , & root , "table_test.go" ); err != nil {
@@ -307,18 +334,63 @@ func printTest(w io.Writer, n *node) error {
307334 fmt .Fprintf (w , "%q,\n " , rule )
308335 }
309336 fmt .Fprintf (w , "}\n \n var nodeLabels = [...]string{\n " )
310- if err := n .walk (w , printNodeLabel ); err != nil {
337+ if err := n .walk (func (n * node ) error {
338+ return printNodeLabel (w , n )
339+ }); err != nil {
311340 return err
312341 }
313342 fmt .Fprintf (w , "}\n " )
314343 return nil
315344}
316345
317- func printReal (w io.Writer , n * node ) error {
346+ func generateBinaryData (root * node , combinedText string ) error {
347+ if err := os .WriteFile ("data/text" , []byte (combinedText ), 0666 ); err != nil {
348+ return err
349+ }
350+
351+ var nodes []byte
352+ if err := root .walk (func (n * node ) error {
353+ for _ , c := range n .children {
354+ nodes = appendNodeEncoding (nodes , c )
355+ }
356+ return nil
357+ }); err != nil {
358+ return err
359+ }
360+ if err := os .WriteFile ("data/nodes" , nodes , 0666 ); err != nil {
361+ return err
362+ }
363+
364+ var children []byte
365+ for _ , c := range childrenEncoding {
366+ children = binary .BigEndian .AppendUint32 (children , c )
367+ }
368+ if err := os .WriteFile ("data/children" , children , 0666 ); err != nil {
369+ return err
370+ }
371+
372+ return nil
373+ }
374+
375+ func appendNodeEncoding (b []byte , n * node ) []byte {
376+ encoding := labelEncoding [n .label ]
377+ if n .icann {
378+ encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset )
379+ }
380+ encoding |= uint64 (n .childrenIndex ) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN )
381+ for i := nodesBits - 8 ; i >= 0 ; i -= 8 {
382+ b = append (b , byte ((encoding >> i )& 0xff ))
383+ }
384+ return b
385+ }
386+
387+ func printMetadata (w io.Writer , n * node ) error {
318388 const header = `// generated by go run gen.go; DO NOT EDIT
319389
320390package publicsuffix
321391
392+ import _ "embed"
393+
322394const version = %q
323395
324396const (
@@ -343,74 +415,36 @@ const (
343415// numTLD is the number of top level domains.
344416const numTLD = %d
345417
418+ // text is the combined text of all labels.
419+ //
420+ //go:embed data/text
421+ var text string
422+
346423`
347424 fmt .Fprintf (w , header , * version ,
348425 nodesBits ,
349426 nodesBitsChildren , nodesBitsICANN , nodesBitsTextOffset , nodesBitsTextLength ,
350427 childrenBitsWildcard , childrenBitsNodeType , childrenBitsHi , childrenBitsLo ,
351428 nodeTypeNormal , nodeTypeException , nodeTypeParentOnly , len (n .children ))
352-
353- text := combineText (labelsList )
354- if text == "" {
355- return fmt .Errorf ("internal error: makeText returned no text" )
356- }
357- for _ , label := range labelsList {
358- offset , length := strings .Index (text , label ), len (label )
359- if offset < 0 {
360- return fmt .Errorf ("internal error: could not find %q in text %q" , label , text )
361- }
362- maxTextOffset , maxTextLength = max (maxTextOffset , offset ), max (maxTextLength , length )
363- if offset >= 1 << nodesBitsTextOffset {
364- return fmt .Errorf ("text offset %d is too large, or nodeBitsTextOffset is too small" , offset )
365- }
366- if length >= 1 << nodesBitsTextLength {
367- return fmt .Errorf ("text length %d is too large, or nodeBitsTextLength is too small" , length )
368- }
369- labelEncoding [label ] = uint64 (offset )<< nodesBitsTextLength | uint64 (length )
370- }
371- fmt .Fprintf (w , "// Text is the combined text of all labels.\n const text = " )
372- for len (text ) > 0 {
373- n , plus := len (text ), ""
374- if n > 64 {
375- n , plus = 64 , " +"
376- }
377- fmt .Fprintf (w , "%q%s\n " , text [:n ], plus )
378- text = text [n :]
379- }
380-
381- if err := n .walk (w , assignIndexes ); err != nil {
382- return err
383- }
384-
385429 fmt .Fprintf (w , `
386-
387430// nodes is the list of nodes. Each node is represented as a %v-bit integer,
388431// which encodes the node's children, wildcard bit and node type (as an index
389432// into the children array), ICANN bit and text.
390433//
391- // If the table was generated with the -comments flag, there is a //-comment
392- // after each node's data. In it is the nodes-array indexes of the children,
393- // formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
394- // nodeType is printed as + for normal, ! for exception, and o for parent-only
395- // nodes that have children but don't match a domain label in their own right.
396- // An I denotes an ICANN domain.
397- //
398434// The layout within the node, from MSB to LSB, is:
399435// [%2d bits] unused
400436// [%2d bits] children index
401437// [%2d bits] ICANN bit
402438// [%2d bits] text index
403439// [%2d bits] text length
404- var nodes = [...]uint8{
440+ //
441+ //go:embed data/nodes
442+ var nodes uint40String
405443` ,
406444 nodesBits ,
407445 nodesBits - nodesBitsChildren - nodesBitsICANN - nodesBitsTextOffset - nodesBitsTextLength ,
408446 nodesBitsChildren , nodesBitsICANN , nodesBitsTextOffset , nodesBitsTextLength )
409- if err := n .walk (w , printNode ); err != nil {
410- return err
411- }
412- fmt .Fprintf (w , `}
413-
447+ fmt .Fprintf (w , `
414448// children is the list of nodes' children, the parent's wildcard bit and the
415449// parent's node type. If a node has no children then their children index
416450// will be in the range [0, 6), depending on the wildcard bit and node type.
@@ -421,27 +455,13 @@ var nodes = [...]uint8{
421455// [%2d bits] node type
422456// [%2d bits] high nodes index (exclusive) of children
423457// [%2d bits] low nodes index (inclusive) of children
424- var children=[...]uint32{
458+ //
459+ //go:embed data/children
460+ var children uint32String
425461` ,
426462 32 - childrenBitsWildcard - childrenBitsNodeType - childrenBitsHi - childrenBitsLo ,
427463 childrenBitsWildcard , childrenBitsNodeType , childrenBitsHi , childrenBitsLo )
428- for i , c := range childrenEncoding {
429- s := "---------------"
430- lo := c & (1 << childrenBitsLo - 1 )
431- hi := (c >> childrenBitsLo ) & (1 << childrenBitsHi - 1 )
432- if lo != hi {
433- s = fmt .Sprintf ("n0x%04x-n0x%04x" , lo , hi )
434- }
435- nodeType := int (c >> (childrenBitsLo + childrenBitsHi )) & (1 << childrenBitsNodeType - 1 )
436- wildcard := c >> (childrenBitsLo + childrenBitsHi + childrenBitsNodeType ) != 0
437- if * comments {
438- fmt .Fprintf (w , "0x%08x, // c0x%04x (%s)%s %s\n " ,
439- c , i , s , wildcardStr (wildcard ), nodeTypeStr (nodeType ))
440- } else {
441- fmt .Fprintf (w , "0x%x,\n " , c )
442- }
443- }
444- fmt .Fprintf (w , "}\n \n " )
464+
445465 fmt .Fprintf (w , "// max children %d (capacity %d)\n " , maxChildren , 1 << nodesBitsChildren - 1 )
446466 fmt .Fprintf (w , "// max text offset %d (capacity %d)\n " , maxTextOffset , 1 << nodesBitsTextOffset - 1 )
447467 fmt .Fprintf (w , "// max text length %d (capacity %d)\n " , maxTextLength , 1 << nodesBitsTextLength - 1 )
@@ -465,12 +485,12 @@ type node struct {
465485 children []* node
466486}
467487
468- func (n * node ) walk (w io. Writer , f func (w1 io. Writer , n1 * node ) error ) error {
469- if err := f (w , n ); err != nil {
488+ func (n * node ) walk (f func (* node ) error ) error {
489+ if err := f (n ); err != nil {
470490 return err
471491 }
472492 for _ , c := range n .children {
473- if err := c .walk (w , f ); err != nil {
493+ if err := c .walk (f ); err != nil {
474494 return err
475495 }
476496 }
@@ -516,7 +536,7 @@ var childrenEncoding = []uint32{
516536
517537var firstCallToAssignIndexes = true
518538
519- func assignIndexes (w io. Writer , n * node ) error {
539+ func assignIndexes (n * node ) error {
520540 if len (n .children ) != 0 {
521541 // Assign nodesIndex.
522542 n .firstChild = nextNodesIndex
@@ -561,32 +581,6 @@ func assignIndexes(w io.Writer, n *node) error {
561581 return nil
562582}
563583
564- func printNode (w io.Writer , n * node ) error {
565- for _ , c := range n .children {
566- s := "---------------"
567- if len (c .children ) != 0 {
568- s = fmt .Sprintf ("n0x%04x-n0x%04x" , c .firstChild , c .firstChild + len (c .children ))
569- }
570- encoding := labelEncoding [c .label ]
571- if c .icann {
572- encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset )
573- }
574- encoding |= uint64 (c .childrenIndex ) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN )
575- for i := nodesBits - 8 ; i >= 0 ; i -= 8 {
576- fmt .Fprintf (w , "0x%02x, " , (encoding >> i )& 0xff )
577- }
578- if * comments {
579- fmt .Fprintf (w , "// n0x%04x c0x%04x (%s)%s %s %s %s\n " ,
580- c .nodesIndex , c .childrenIndex , s , wildcardStr (c .wildcard ),
581- nodeTypeStr (c .nodeType ), icannStr (c .icann ), c .label ,
582- )
583- } else {
584- fmt .Fprintf (w , "\n " )
585- }
586- }
587- return nil
588- }
589-
590584func printNodeLabel (w io.Writer , n * node ) error {
591585 for _ , c := range n .children {
592586 fmt .Fprintf (w , "%q,\n " , c .label )
0 commit comments