@@ -72,8 +72,8 @@ _Py_device_encoding(int fd)
7272
7373extern int _Py_normalize_encoding (const char * , char * , size_t );
7474
75- /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
76- On these operating systems, nl_langinfo(CODESET) announces an alias of the
75+ /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
76+ and POSIX locale. nl_langinfo(CODESET) announces an alias of the
7777 ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
7878 ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
7979 locale.getpreferredencoding() codec. For example, if command line arguments
@@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
8686 workaround is also enabled on error, for example if getting the locale
8787 failed.
8888
89+ On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
90+ announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
91+ ASCII encoding in this case.
92+
8993 Values of force_ascii:
9094
9195 1: the workaround is used: Py_EncodeLocale() uses
@@ -100,13 +104,46 @@ static int force_ascii = -1;
100104static int
101105check_force_ascii (void )
102106{
103- char * loc ;
107+ char * loc = setlocale (LC_CTYPE , NULL );
108+ if (loc == NULL ) {
109+ goto error ;
110+ }
111+ if (strcmp (loc , "C" ) != 0 && strcmp (loc , "POSIX" ) != 0 ) {
112+ /* the LC_CTYPE locale is different than C and POSIX */
113+ return 0 ;
114+ }
115+
104116#if defined(HAVE_LANGINFO_H ) && defined(CODESET )
105- char * codeset , * * alias ;
117+ const char * codeset = nl_langinfo (CODESET );
118+ if (!codeset || codeset [0 ] == '\0' ) {
119+ /* CODESET is not set or empty */
120+ goto error ;
121+ }
122+
106123 char encoding [20 ]; /* longest name: "iso_646.irv_1991\0" */
107- int is_ascii ;
108- unsigned int i ;
109- char * ascii_aliases [] = {
124+ if (!_Py_normalize_encoding (codeset , encoding , sizeof (encoding ))) {
125+ goto error ;
126+ }
127+
128+ #ifdef __hpux
129+ if (strcmp (encoding , "roman8" ) == 0 ) {
130+ unsigned char ch ;
131+ wchar_t wch ;
132+ size_t res ;
133+
134+ ch = (unsigned char )0xA7 ;
135+ res = mbstowcs (& wch , (char * )& ch , 1 );
136+ if (res != (size_t )-1 && wch == L'\xA7' ) {
137+ /* On HP-UX with the C locale or the POSIX locale,
138+ nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
139+ Latin1 encoding in practice. Force ASCII in this case.
140+
141+ Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
142+ return 1 ;
143+ }
144+ }
145+ #else
146+ const char * ascii_aliases [] = {
110147 "ascii" ,
111148 /* Aliases from Lib/encodings/aliases.py */
112149 "646" ,
@@ -123,27 +160,9 @@ check_force_ascii(void)
123160 "us_ascii" ,
124161 NULL
125162 };
126- #endif
127-
128- loc = setlocale (LC_CTYPE , NULL );
129- if (loc == NULL )
130- goto error ;
131- if (strcmp (loc , "C" ) != 0 && strcmp (loc , "POSIX" ) != 0 ) {
132- /* the LC_CTYPE locale is different than C */
133- return 0 ;
134- }
135-
136- #if defined(HAVE_LANGINFO_H ) && defined(CODESET )
137- codeset = nl_langinfo (CODESET );
138- if (!codeset || codeset [0 ] == '\0' ) {
139- /* CODESET is not set or empty */
140- goto error ;
141- }
142- if (!_Py_normalize_encoding (codeset , encoding , sizeof (encoding )))
143- goto error ;
144163
145- is_ascii = 0 ;
146- for (alias = ascii_aliases ; * alias != NULL ; alias ++ ) {
164+ int is_ascii = 0 ;
165+ for (const char * * alias = ascii_aliases ; * alias != NULL ; alias ++ ) {
147166 if (strcmp (encoding , * alias ) == 0 ) {
148167 is_ascii = 1 ;
149168 break ;
@@ -154,13 +173,14 @@ check_force_ascii(void)
154173 return 0 ;
155174 }
156175
157- for (i = 0x80 ; i < 0xff ; i ++ ) {
158- unsigned char ch ;
159- wchar_t wch ;
176+ for (unsigned int i = 0x80 ; i <= 0xff ; i ++ ) {
177+ char ch [ 1 ] ;
178+ wchar_t wch [ 1 ] ;
160179 size_t res ;
161180
162- ch = (unsigned char )i ;
163- res = mbstowcs (& wch , (char * )& ch , 1 );
181+ unsigned uch = (unsigned char )i ;
182+ ch [0 ] = (char )uch ;
183+ res = mbstowcs (wch , ch , 1 );
164184 if (res != (size_t )-1 ) {
165185 /* decoding a non-ASCII character from the locale encoding succeeded:
166186 the locale encoding is not ASCII, force ASCII */
@@ -169,17 +189,29 @@ check_force_ascii(void)
169189 }
170190 /* None of the bytes in the range 0x80-0xff can be decoded from the locale
171191 encoding: the locale encoding is really ASCII */
192+ #endif /* !defined(__hpux) */
172193 return 0 ;
173194#else
174195 /* nl_langinfo(CODESET) is not available: always force ASCII */
175196 return 1 ;
176- #endif
197+ #endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */
177198
178199error :
179200 /* if an error occurred, force the ASCII encoding */
180201 return 1 ;
181202}
182203
204+
205+ int
206+ _Py_GetForceASCII (void )
207+ {
208+ if (force_ascii == -1 ) {
209+ force_ascii = check_force_ascii ();
210+ }
211+ return force_ascii ;
212+ }
213+
214+
183215static int
184216encode_ascii (const wchar_t * text , char * * str ,
185217 size_t * error_pos , const char * * reason ,
@@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
234266 * str = result ;
235267 return 0 ;
236268}
269+ #else
270+ int
271+ _Py_GetForceASCII (void )
272+ {
273+ return 0 ;
274+ }
237275#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */
238276
239277
0 commit comments