Skip to content
This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit e1c7bc6

Browse files
authored
Define TORCHTEXT_API macro for visibility control (#1806)
## Context: TorchText uses dual-binding (PyBind11 and TorchBind) to make custom operations available in Python. Both bindings eventually call the same implementation contained in `libtorchtext.so`. The ones bound via PyBind11 (the ones in `torchtext._torchtext`) call into `libtorchtext.so`. ![Untitled drawing](https://user-images.githubusercontent.com/855818/175428489-c288b3cc-0b9f-4230-95ed-fd7c063bb6fa.jpg) This means that `libtorchtext.so` has to make the symbols (APIs) used by `torchtext._torchtext` visible. However, the default visibility of symbols in shared libraries is different on Windows. On Windows, all symbols are hidden by default. To work around this, we use `CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS` to expose all the symbols. There is an upper limit on the number of visible symbols one library file can contain, and this can be problematic in the future. (Although it is unlikely that torchtext will hit the limit, unless it introduces custom CUDA kernels.) A better approach is to selectively mark the symbols that should be visible as visible. ## Summary of the change set This commit introduces the `TORCHTEXT_API` macro, which annotates functions with the proper visibility. The core logic was taken from https://github.com/pytorch/pytorch/blob/bcc02769bef1d7b89bec724223284958b7c5b564/c10/macros/Export.h The behavior is as follows: For non-Windows: It is always `__attribute__((__visibility__("default")))` For Windows: If the header is included from the compilation unit of `libtorchtext`, then it resolves to `__declspec(dllexport)`. Otherwise it resolves to `__declspec(dllimport)`. This allows us to remove `CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS`.
1 parent a4a81d2 commit e1c7bc6

File tree

10 files changed

+132
-78
lines changed

10 files changed

+132
-78
lines changed

torchtext/csrc/CMakeLists.txt

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,3 @@
1-
# the following line is added in order to export symbols when building on Windows
2-
# this approach has some limitations as documented in https://github.com/pytorch/pytorch/pull/3650
3-
if (MSVC)
4-
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
5-
endif()
6-
71
################################################################################
82
# libtorchtext
93
################################################################################
@@ -41,7 +35,8 @@ set(
4135
)
4236

4337
set(
44-
LIBTORCHTEXT_COMPILE_DEFINITIONS)
38+
LIBTORCHTEXT_COMPILE_DEFINITIONS
39+
TORCHTEXT_BUILD_MAIN_LIB)
4540

4641
function (define_library name source include_dirs link_libraries compile_defs)
4742
add_library(${name} SHARED ${source})

torchtext/csrc/bert_tokenizer.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <torchtext/csrc/export.h>
12
#include <torchtext/csrc/vocab.h>
23
#include <string>
34
#include <vector>
@@ -11,19 +12,20 @@ typedef std::tuple<bool, c10::optional<bool>, std::vector<std::string>>
1112
BERTEncoderStates;
1213

1314
struct BERTEncoder : torch::CustomClassHolder {
14-
BERTEncoder(
15+
TORCHTEXT_API BERTEncoder(
1516
const std::string& vocab_file,
1617
bool do_lower_case,
1718
c10::optional<bool> strip_accents);
1819
BERTEncoder(
1920
Vocab vocab,
2021
bool do_lower_case,
2122
c10::optional<bool> strip_accents);
22-
std::vector<std::string> Tokenize(std::string text);
23-
std::vector<int64_t> Encode(std::string text);
24-
std::vector<std::vector<std::string>> BatchTokenize(
23+
TORCHTEXT_API std::vector<std::string> Tokenize(std::string text);
24+
TORCHTEXT_API std::vector<int64_t> Encode(std::string text);
25+
TORCHTEXT_API std::vector<std::vector<std::string>> BatchTokenize(
26+
std::vector<std::string> text);
27+
TORCHTEXT_API std::vector<std::vector<int64_t>> BatchEncode(
2528
std::vector<std::string> text);
26-
std::vector<std::vector<int64_t>> BatchEncode(std::vector<std::string> text);
2729

2830
Vocab vocab_;
2931
bool do_lower_case_;
@@ -40,8 +42,8 @@ struct BERTEncoder : torch::CustomClassHolder {
4042
static std::string kUnkToken;
4143
};
4244

43-
BERTEncoderStates _serialize_bert_encoder(
44-
const c10::intrusive_ptr<BERTEncoder>& self);
45-
c10::intrusive_ptr<BERTEncoder> _deserialize_bert_encoder(
45+
TORCHTEXT_API BERTEncoderStates
46+
_serialize_bert_encoder(const c10::intrusive_ptr<BERTEncoder>& self);
47+
TORCHTEXT_API c10::intrusive_ptr<BERTEncoder> _deserialize_bert_encoder(
4648
BERTEncoderStates states);
4749
} // namespace torchtext

torchtext/csrc/clip_tokenizer.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef CLIP_TOKENIZER_H_
22
#define CLIP_TOKENIZER_H_
33

4+
#include <torchtext/csrc/export.h>
45
#include <torchtext/csrc/gpt2_bpe_tokenizer.h>
56

67
namespace torchtext {
@@ -25,21 +26,22 @@ struct CLIPEncoder : GPT2BPEEncoder {
2526
public:
2627
using GPT2BPEEncoder::GPT2BPEEncoder;
2728

28-
std::vector<int64_t> Encode(const std::string& text);
29-
std::vector<std::string> Tokenize(const std::string& text);
29+
TORCHTEXT_API std::vector<int64_t> Encode(const std::string& text);
30+
TORCHTEXT_API std::vector<std::string> Tokenize(const std::string& text);
3031

3132
protected:
32-
std::vector<std::string> BPE_(
33+
TORCHTEXT_API std::vector<std::string> BPE_(
3334
const std::vector<std::string>& token_list) override;
3435

35-
std::vector<std::string> PreTokenize_(std::string input) override;
36+
TORCHTEXT_API std::vector<std::string> PreTokenize_(
37+
std::string input) override;
3638
};
3739

38-
CLIPEncoderStatesPybind _serialize_clip_encoder_pybind(
39-
const c10::intrusive_ptr<CLIPEncoder>& self);
40+
TORCHTEXT_API CLIPEncoderStatesPybind
41+
_serialize_clip_encoder_pybind(const c10::intrusive_ptr<CLIPEncoder>& self);
4042
CLIPEncoderStatesTorchbind _serialize_clip_encoder_torchbind(
4143
const c10::intrusive_ptr<CLIPEncoder>& self);
42-
c10::intrusive_ptr<CLIPEncoder> _deserialize_clip_encoder_pybind(
44+
TORCHTEXT_API c10::intrusive_ptr<CLIPEncoder> _deserialize_clip_encoder_pybind(
4345
CLIPEncoderStatesPybind states);
4446
c10::intrusive_ptr<CLIPEncoder> _deserialize_clip_encoder_torchbind(
4547
CLIPEncoderStatesTorchbind states);

torchtext/csrc/export.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#pragma once
2+
3+
// Define the visibility of symbols.
4+
// The original logic and background can be found here.
5+
// https://github.com/pytorch/pytorch/blob/bcc02769bef1d7b89bec724223284958b7c5b564/c10/macros/Export.h#L49-L55
6+
//
7+
// In the context of torchtext, the logic is simpler at the moment.
8+
//
9+
// The torchtext custom operations are implemented in
10+
// `torchtext/lib/libtorchtext.[so|pyd]`. Some symbols are referenced from
11+
// `torchtext._torchtext`.
12+
//
13+
// On Windows, symbols in a dynamic library are hidden by default, while on
14+
// Linux/macOS, they are visible.
15+
//
16+
// At the moment we do not expect torchtext libraries to be built/linked
17+
// statically. We assume they are always shared.
18+
19+
#ifdef _WIN32
20+
#define TORCHTEXT_EXPORT __declspec(dllexport)
21+
#define TORCHTEXT_IMPORT __declspec(dllimport)
22+
#else // _WIN32
23+
#if defined(__GNUC__)
24+
#define TORCHTEXT_EXPORT __attribute__((__visibility__("default")))
25+
#else // defined(__GNUC__)
26+
#define TORCHTEXT_EXPORT
27+
#endif // defined(__GNUC__)
28+
#define TORCHTEXT_IMPORT TORCHTEXT_EXPORT
29+
#endif // _WIN32
30+
31+
#ifdef TORCHTEXT_BUILD_MAIN_LIB
32+
#define TORCHTEXT_API TORCHTEXT_EXPORT
33+
#else
34+
#define TORCHTEXT_API TORCHTEXT_IMPORT
35+
#endif

torchtext/csrc/gpt2_bpe_tokenizer.h

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define GPT2_BPE_TOKENIZER_H_
33

44
#include <torch/script.h>
5+
#include <torchtext/csrc/export.h>
56

67
#include <cstdint>
78
#include <string>
@@ -79,7 +80,7 @@ struct GPT2BPEEncoder : torch::CustomClassHolder {
7980
const c10::Dict<int64_t, std::string>& byte_encoder,
8081
bool caching_enabled = false);
8182

82-
explicit GPT2BPEEncoder(
83+
TORCHTEXT_API explicit GPT2BPEEncoder(
8384
const std::unordered_map<std::string, int64_t>& bpe_encoder,
8485
const std::unordered_map<std::string, int64_t>& bpe_merge_ranks,
8586
const std::string& seperator,
@@ -97,20 +98,21 @@ struct GPT2BPEEncoder : torch::CustomClassHolder {
9798
// --> bpe encode --> bpe token ids: [707, 5927], [11], [707, 68]
9899
// --> result --> [707, 5927, 11, 707, 68]
99100
//
100-
std::vector<int64_t> Encode(const std::string& text);
101-
std::vector<std::string> Tokenize(const std::string& text);
101+
TORCHTEXT_API std::vector<int64_t> Encode(const std::string& text);
102+
TORCHTEXT_API std::vector<std::string> Tokenize(const std::string& text);
102103

103-
std::unordered_map<std::string, int64_t> GetBPEEncoder() const;
104-
std::unordered_map<std::string, int64_t> GetBPEMergeRanks() const;
105-
std::unordered_map<int64_t, std::string> GetByteEncoder() const;
104+
TORCHTEXT_API std::unordered_map<std::string, int64_t> GetBPEEncoder() const;
105+
TORCHTEXT_API std::unordered_map<std::string, int64_t> GetBPEMergeRanks()
106+
const;
107+
TORCHTEXT_API std::unordered_map<int64_t, std::string> GetByteEncoder() const;
106108
};
107109

108-
GPT2BPEEncoderStatesPybind _serialize_gpt2_bpe_encoder_pybind(
110+
TORCHTEXT_API GPT2BPEEncoderStatesPybind _serialize_gpt2_bpe_encoder_pybind(
109111
const c10::intrusive_ptr<GPT2BPEEncoder>& self);
110112
GPT2BPEEncoderStatesTorchbind _serialize_gpt2_bpe_encoder_torchbind(
111113
const c10::intrusive_ptr<GPT2BPEEncoder>& self);
112-
c10::intrusive_ptr<GPT2BPEEncoder> _deserialize_gpt2_bpe_encoder_pybind(
113-
GPT2BPEEncoderStatesPybind states);
114+
TORCHTEXT_API c10::intrusive_ptr<GPT2BPEEncoder>
115+
_deserialize_gpt2_bpe_encoder_pybind(GPT2BPEEncoderStatesPybind states);
114116
c10::intrusive_ptr<GPT2BPEEncoder> _deserialize_gpt2_bpe_encoder_torchbind(
115117
GPT2BPEEncoderStatesTorchbind states);
116118
} // namespace torchtext

torchtext/csrc/regex.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <re2/re2.h>
22
#include <re2/stringpiece.h>
33
#include <torch/script.h>
4+
#include <torchtext/csrc/export.h>
45
#include <string>
56

67
namespace torchtext {
@@ -11,12 +12,14 @@ struct Regex : torch::CustomClassHolder {
1112
public:
1213
std::string re_str_;
1314

14-
Regex(const std::string& re_str);
15-
std::string Sub(std::string str, const std::string& repl) const;
16-
bool FindAndConsume(re2::StringPiece* input, std::string* text) const;
15+
TORCHTEXT_API Regex(const std::string& re_str);
16+
TORCHTEXT_API std::string Sub(std::string str, const std::string& repl) const;
17+
TORCHTEXT_API bool FindAndConsume(re2::StringPiece* input, std::string* text)
18+
const;
1719
};
1820

19-
std::string _serialize_regex(const c10::intrusive_ptr<Regex>& self);
20-
c10::intrusive_ptr<Regex> _deserialize_regex(std::string&& state);
21+
TORCHTEXT_API std::string _serialize_regex(
22+
const c10::intrusive_ptr<Regex>& self);
23+
TORCHTEXT_API c10::intrusive_ptr<Regex> _deserialize_regex(std::string&& state);
2124

2225
} // namespace torchtext

torchtext/csrc/regex_tokenizer.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <re2/re2.h>
22
#include <torch/script.h>
3+
#include <torchtext/csrc/export.h>
34

45
namespace torchtext {
56

@@ -19,16 +20,16 @@ struct RegexTokenizer : torch::CustomClassHolder {
1920
std::vector<std::string> replacements_;
2021
bool to_lower_;
2122

22-
explicit RegexTokenizer(
23+
TORCHTEXT_API explicit RegexTokenizer(
2324
const std::vector<std::string>& patterns,
2425
const std::vector<std::string>& replacements,
2526
const bool to_lower);
26-
std::vector<std::string> forward(std::string str) const;
27+
TORCHTEXT_API std::vector<std::string> forward(std::string str) const;
2728
};
2829

29-
RegexTokenizerStates _serialize_regex_tokenizer(
30-
const c10::intrusive_ptr<RegexTokenizer>& self);
31-
c10::intrusive_ptr<RegexTokenizer> _deserialize_regex_tokenizer(
30+
TORCHTEXT_API RegexTokenizerStates
31+
_serialize_regex_tokenizer(const c10::intrusive_ptr<RegexTokenizer>& self);
32+
TORCHTEXT_API c10::intrusive_ptr<RegexTokenizer> _deserialize_regex_tokenizer(
3233
RegexTokenizerStates&& states);
3334

3435
} // namespace torchtext

torchtext/csrc/sentencepiece.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <sentencepiece_processor.h>
22
#include <sentencepiece_trainer.h>
33
#include <torch/script.h>
4+
#include <torchtext/csrc/export.h>
45

56
namespace torchtext {
67

@@ -16,16 +17,19 @@ struct SentencePiece : torch::CustomClassHolder {
1617
// serialized model from this content_ member, thus it needs to be public.
1718
std::string content_;
1819

19-
explicit SentencePiece(const std::string& content);
20-
std::vector<std::string> Encode(const std::string& input) const;
21-
std::vector<int64_t> EncodeAsIds(const std::string& input) const;
22-
std::string DecodeIds(const std::vector<int64_t>& ids) const;
23-
std::vector<std::string> EncodeAsPieces(const std::string& input) const;
24-
std::string DecodePieces(const std::vector<std::string>& pieces) const;
25-
int64_t GetPieceSize() const;
26-
int64_t unk_id() const;
27-
int64_t PieceToId(const std::string& piece) const;
28-
std::string IdToPiece(const int64_t id) const;
20+
TORCHTEXT_API explicit SentencePiece(const std::string& content);
21+
TORCHTEXT_API std::vector<std::string> Encode(const std::string& input) const;
22+
TORCHTEXT_API std::vector<int64_t> EncodeAsIds(
23+
const std::string& input) const;
24+
TORCHTEXT_API std::string DecodeIds(const std::vector<int64_t>& ids) const;
25+
TORCHTEXT_API std::vector<std::string> EncodeAsPieces(
26+
const std::string& input) const;
27+
TORCHTEXT_API std::string DecodePieces(
28+
const std::vector<std::string>& pieces) const;
29+
TORCHTEXT_API int64_t GetPieceSize() const;
30+
TORCHTEXT_API int64_t unk_id() const;
31+
TORCHTEXT_API int64_t PieceToId(const std::string& piece) const;
32+
TORCHTEXT_API std::string IdToPiece(const int64_t id) const;
2933
};
3034

3135
void generate_sp_model(

torchtext/csrc/vectors.h

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include <torch/script.h>
2+
#include <torchtext/csrc/export.h>
23

34
namespace torchtext {
45

@@ -26,22 +27,28 @@ struct Vectors : torch::CustomClassHolder {
2627
const IndexMap& stoi,
2728
torch::Tensor vectors,
2829
torch::Tensor unk_tensor);
29-
explicit Vectors(
30+
TORCHTEXT_API explicit Vectors(
3031
const std::vector<std::string>& tokens,
3132
const std::vector<std::int64_t>& indices,
3233
torch::Tensor vectors,
3334
torch::Tensor unk_tensor);
34-
std::unordered_map<std::string, int64_t> get_stoi();
35-
torch::Tensor __getitem__(const std::string& token);
36-
torch::Tensor lookup_vectors(const std::vector<std::string>& tokens);
37-
void __setitem__(const std::string& token, const torch::Tensor& vector);
38-
int64_t __len__();
35+
TORCHTEXT_API std::unordered_map<std::string, int64_t> get_stoi();
36+
TORCHTEXT_API torch::Tensor __getitem__(const std::string& token);
37+
TORCHTEXT_API torch::Tensor lookup_vectors(
38+
const std::vector<std::string>& tokens);
39+
TORCHTEXT_API void __setitem__(
40+
const std::string& token,
41+
const torch::Tensor& vector);
42+
TORCHTEXT_API int64_t __len__();
3943
};
4044

41-
VectorsStates _serialize_vectors(const c10::intrusive_ptr<Vectors>& self);
42-
c10::intrusive_ptr<Vectors> _deserialize_vectors(VectorsStates states);
45+
TORCHTEXT_API VectorsStates
46+
_serialize_vectors(const c10::intrusive_ptr<Vectors>& self);
47+
TORCHTEXT_API c10::intrusive_ptr<Vectors> _deserialize_vectors(
48+
VectorsStates states);
4349

44-
std::tuple<Vectors, std::vector<std::string>> _load_token_and_vectors_from_file(
50+
TORCHTEXT_API std::tuple<Vectors, std::vector<std::string>>
51+
_load_token_and_vectors_from_file(
4552
const std::string& file_path,
4653
const std::string& delimiter_str,
4754
const int64_t num_cpus,

torchtext/csrc/vocab.h

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22
#include <c10/util/string_view.h>
33
#include <torch/script.h>
4+
#include <torchtext/csrc/export.h>
45
#include <algorithm>
56

67
namespace torchtext {
@@ -27,7 +28,7 @@ struct CompareTokens {
2728
}
2829
};
2930

30-
int64_t _infer_lines(const std::string& file_path);
31+
TORCHTEXT_API int64_t _infer_lines(const std::string& file_path);
3132

3233
struct Vocab : torch::CustomClassHolder {
3334
static const int32_t MAX_VOCAB_SIZE = 30000000;
@@ -40,23 +41,24 @@ struct Vocab : torch::CustomClassHolder {
4041
// TODO: [can we remove this?] we need to keep this constructor, otherwise
4142
// torch binding gets compilation error: no matching constructor for
4243
// initialization of 'torchtext::Vocab'
43-
explicit Vocab(StringList tokens);
44-
explicit Vocab(
44+
TORCHTEXT_API explicit Vocab(StringList tokens);
45+
TORCHTEXT_API explicit Vocab(
4546
StringList tokens,
4647
const c10::optional<int64_t>& default_index);
47-
int64_t __len__() const;
48-
int64_t __getitem__(const c10::string_view& token) const;
49-
bool __contains__(const c10::string_view& token) const;
50-
void set_default_index(c10::optional<int64_t> index);
51-
c10::optional<int64_t> get_default_index() const;
52-
void insert_token(std::string token, const int64_t& index);
53-
void append_token(std::string token);
54-
std::string lookup_token(const int64_t& index);
55-
std::vector<std::string> lookup_tokens(const std::vector<int64_t>& indices);
48+
TORCHTEXT_API int64_t __len__() const;
49+
TORCHTEXT_API int64_t __getitem__(const c10::string_view& token) const;
50+
TORCHTEXT_API bool __contains__(const c10::string_view& token) const;
51+
TORCHTEXT_API void set_default_index(c10::optional<int64_t> index);
52+
TORCHTEXT_API c10::optional<int64_t> get_default_index() const;
53+
TORCHTEXT_API void insert_token(std::string token, const int64_t& index);
54+
TORCHTEXT_API void append_token(std::string token);
55+
TORCHTEXT_API std::string lookup_token(const int64_t& index);
56+
TORCHTEXT_API std::vector<std::string> lookup_tokens(
57+
const std::vector<int64_t>& indices);
5658
std::vector<int64_t> lookup_indices(
5759
const std::vector<c10::string_view>& tokens);
58-
std::unordered_map<std::string, int64_t> get_stoi() const;
59-
std::vector<std::string> get_itos() const;
60+
TORCHTEXT_API std::unordered_map<std::string, int64_t> get_stoi() const;
61+
TORCHTEXT_API std::vector<std::string> get_itos() const;
6062

6163
protected:
6264
uint32_t _hash(const c10::string_view& str) const {
@@ -86,14 +88,15 @@ struct Vocab : torch::CustomClassHolder {
8688
}
8789
};
8890

89-
VocabStates _serialize_vocab(const c10::intrusive_ptr<Vocab>& self);
90-
c10::intrusive_ptr<Vocab> _deserialize_vocab(VocabStates states);
91+
TORCHTEXT_API VocabStates
92+
_serialize_vocab(const c10::intrusive_ptr<Vocab>& self);
93+
TORCHTEXT_API c10::intrusive_ptr<Vocab> _deserialize_vocab(VocabStates states);
9194

92-
Vocab _load_vocab_from_file(
95+
TORCHTEXT_API Vocab _load_vocab_from_file(
9396
const std::string& file_path,
9497
const int64_t min_freq,
9598
const int64_t num_cpus);
96-
Vocab _build_vocab_from_text_file(
99+
TORCHTEXT_API Vocab _build_vocab_from_text_file(
97100
const std::string& file_path,
98101
const int64_t min_freq,
99102
const int64_t num_cpus,

0 commit comments

Comments
 (0)