@@ -39,28 +39,60 @@ namespace {
39
39
// Sentinel token attached to every symbol whose Flags include
// Symbol::IndexedForCodeCompletion.
const Token RestrictedForCodeCompletion =
40
40
Token (Token::Kind::Sentinel, " Restricted For Code Completion" );
41
41
42
- // Returns the tokens which are given symbol's characteristics: the fuzzy
43
- // matching trigrams of its name, its scope, the proximity URIs derived from
44
- // its canonical declaration (when that file URI is non-empty), the
45
- // RestrictedForCodeCompletion sentinel (when the IndexedForCodeCompletion
46
- // flag is set) and its type (when the type is non-empty).
47
- // Tokens are appended to the result in that order.
48
- // FIXME(kbobyrev): Support more token types:
49
- // * Namespace proximity
50
- std::vector<Token> generateSearchTokens (const Symbol &Sym) {
51
- std::vector<Token> Result = generateIdentifierTrigrams (Sym.Name );
52
- Result.emplace_back (Token::Kind::Scope, Sym.Scope );
53
- // Proximity tokens are skipped for symbols with unknown declaration location.
54
- if (!llvm::StringRef (Sym.CanonicalDeclaration .FileURI ).empty ())
55
- for (const auto &ProximityURI :
56
- generateProximityURIs (Sym.CanonicalDeclaration .FileURI ))
57
- Result.emplace_back (Token::Kind::ProximityURI, ProximityURI);
58
- if (Sym.Flags & Symbol::IndexedForCodeCompletion)
59
- Result.emplace_back (RestrictedForCodeCompletion);
60
- if (!Sym.Type .empty ())
61
- Result.emplace_back (Token::Kind::Type, Sym.Type );
62
- return Result;
63
- }
42
+ // Helper to efficiently assemble the inverse index (token -> matching docs).
43
+ // The output is a nice uniform structure keyed on Token, but constructing
44
+ // the Token object every time we want to insert into the map is wasteful.
45
+ // Instead we have various maps keyed on things that are cheap to compute,
46
+ // and produce the Token keys once at the end.
47
+ class IndexBuilder {
48
+ llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
49
+ std::vector<DocID> RestrictedCCDocs;
50
+ llvm::StringMap<std::vector<DocID>> TypeDocs;
51
+ llvm::StringMap<std::vector<DocID>> ScopeDocs;
52
+ llvm::StringMap<std::vector<DocID>> ProximityDocs;
53
+ std::vector<Trigram> TrigramScratch;
54
+
55
+ public:
56
+ // Add the tokens which are given symbol's characteristics.
57
+ // This includes fuzzy matching trigrams, symbol's scope, etc.
58
+ // FIXME(kbobyrev): Support more token types:
59
+ // * Namespace proximity
60
+ void add (const Symbol &Sym, DocID D) {
61
+ generateIdentifierTrigrams (Sym.Name , TrigramScratch);
62
+ for (Trigram T : TrigramScratch)
63
+ TrigramDocs[T].push_back (D);
64
+ ScopeDocs[Sym.Scope ].push_back (D);
65
+ if (!llvm::StringRef (Sym.CanonicalDeclaration .FileURI ).empty ())
66
+ for (const auto &ProximityURI :
67
+ generateProximityURIs (Sym.CanonicalDeclaration .FileURI ))
68
+ ProximityDocs[ProximityURI].push_back (D);
69
+ if (Sym.Flags & Symbol::IndexedForCodeCompletion)
70
+ RestrictedCCDocs.push_back (D);
71
+ if (!Sym.Type .empty ())
72
+ TypeDocs[Sym.Type ].push_back (D);
73
+ }
74
+
75
+ // Assemble the final compressed posting lists for the added symbols.
76
+ llvm::DenseMap<Token, PostingList> build () {
77
+ llvm::DenseMap<Token, PostingList> Result (/* InitialReserve=*/
78
+ TrigramDocs.size () +
79
+ RestrictedCCDocs.size () +
80
+ TypeDocs.size () +
81
+ ScopeDocs.size () +
82
+ ProximityDocs.size ());
83
+ for (const auto &E : TrigramDocs)
84
+ Result.try_emplace (Token (Token::Kind::Trigram, E.first .str ()), E.second );
85
+ for (const auto &E : TypeDocs)
86
+ Result.try_emplace (Token (Token::Kind::Type, E.first ()), E.second );
87
+ for (const auto &E : ScopeDocs)
88
+ Result.try_emplace (Token (Token::Kind::Scope, E.first ()), E.second );
89
+ for (const auto &E : ProximityDocs)
90
+ Result.try_emplace (Token (Token::Kind::ProximityURI, E.first ()), E.second );
91
+ if (!RestrictedCCDocs.empty ())
92
+ Result.try_emplace (RestrictedForCodeCompletion, RestrictedCCDocs);
93
+ return Result;
94
+ }
95
+ };
64
96
65
97
} // namespace
66
98
@@ -86,18 +118,11 @@ void Dex::buildIndex() {
86
118
Symbols[I] = ScoredSymbols[I].second ;
87
119
}
88
120
89
- // Populate TempInvertedIndex with lists for index symbols.
90
- llvm::DenseMap<Token, std::vector<DocID>> TempInvertedIndex;
91
- for (DocID SymbolRank = 0 ; SymbolRank < Symbols.size (); ++SymbolRank) {
92
- const auto *Sym = Symbols[SymbolRank];
93
- for (const auto &Token : generateSearchTokens (*Sym))
94
- TempInvertedIndex[Token].push_back (SymbolRank);
95
- }
96
-
97
- // Convert lists of items to posting lists.
98
- for (const auto &TokenToPostingList : TempInvertedIndex)
99
- InvertedIndex.insert (
100
- {TokenToPostingList.first , PostingList (TokenToPostingList.second )});
121
+ // Build posting lists for symbols.
122
+ IndexBuilder Builder;
123
+ for (DocID SymbolRank = 0 ; SymbolRank < Symbols.size (); ++SymbolRank)
124
+ Builder.add (*Symbols[SymbolRank], SymbolRank);
125
+ InvertedIndex = Builder.build ();
101
126
}
102
127
103
128
std::unique_ptr<Iterator> Dex::iterator (const Token &Tok) const {
0 commit comments