@@ -117,27 +117,99 @@ use crate::vec::Vec;
117117///
118118/// # UTF-8
119119///
120- /// `String`s are always valid UTF-8. This has a few implications, the first of
121- /// which is that if you need a non-UTF-8 string, consider [`OsString`]. It is
122- /// similar, but without the UTF-8 constraint. The second implication is that
123- /// you cannot index into a `String`:
120+ /// `String`s are always valid UTF-8. If you need a non-UTF-8 string, consider
121+ /// [`OsString`]. It is similar, but without the UTF-8 constraint. Because UTF-8
122+ /// is a variable-width encoding, `String`s are typically smaller than an array of
123+ /// the same `char`s:
124+ ///
125+ /// ```
126+ /// use std::mem;
127+ ///
128+ /// // `s` is ASCII which represents each `char` as one byte
129+ /// let s = "hello";
130+ /// assert_eq!(s.len(), 5);
131+ ///
132+ /// // A `char` array with the same contents would be larger because
133+ /// // every `char` is four bytes
134+ /// let s = ['h', 'e', 'l', 'l', 'o'];
135+ /// let size: usize = s.into_iter().map(|c| mem::size_of_val(&c)).sum();
136+ /// assert_eq!(size, 20);
137+ ///
138+ /// // However, for non-ASCII strings, the difference will be smaller
139+ /// // and sometimes they are the same
140+ /// let s = "💖💖💖💖💖";
141+ /// assert_eq!(s.len(), 20);
142+ ///
143+ /// let s = ['💖', '💖', '💖', '💖', '💖'];
144+ /// let size: usize = s.into_iter().map(|c| mem::size_of_val(&c)).sum();
145+ /// assert_eq!(size, 20);
146+ /// ```
147+ ///
148+ /// This raises interesting questions as to how `s[i]` should work.
149+ /// What should `i` be here? Several options include byte indices and
150+ /// `char` indices but, because of UTF-8 encoding, only byte indices
151+ /// would provide constant-time indexing. Getting the `i`th `char`, for
152+ /// example, is available using [`chars`]:
153+ ///
154+ /// ```
155+ /// let s = "hello";
156+ /// let third_character = s.chars().nth(2);
157+ /// assert_eq!(third_character, Some('l'));
158+ ///
159+ /// let s = "💖💖💖💖💖";
160+ /// let third_character = s.chars().nth(2);
161+ /// assert_eq!(third_character, Some('💖'));
162+ /// ```
163+ ///
164+ /// Next, what should `s[i]` return? Because indexing returns a reference
165+ /// to underlying data it could be `&u8`, `&[u8]`, or something else similar.
166+ /// Since we're only providing one index, `&u8` makes the most sense but that
167+ /// might not be what the user expects and can be explicitly achieved with
168+ /// [`as_bytes()`]:
169+ ///
170+ /// ```
171+ /// // The first byte is 104 - the byte value of `'h'`
172+ /// let s = "hello";
173+ /// assert_eq!(s.as_bytes()[0], 104);
174+ /// // or
175+ /// assert_eq!(s.as_bytes()[0], b'h');
176+ ///
177+ /// // The first byte is 240 which isn't obviously useful
178+ /// let s = "💖💖💖💖💖";
179+ /// assert_eq!(s.as_bytes()[0], 240);
180+ /// ```
181+ ///
182+ /// Due to these ambiguities/restrictions, indexing with a `usize` is simply
183+ /// forbidden:
124184///
125185/// ```compile_fail,E0277
126186/// let s = "hello";
127187///
128- /// println!("The first letter of s is {}", s[0]); // ERROR!!!
188+ /// // The following will not compile!
189+ /// println!("The first letter of s is {}", s[0]);
129190/// ```
130191///
192+ /// It is clearer, however, how `&s[i..j]` should work (that is,
193+ /// indexing with a range). It should accept byte indices (to be constant-time)
194+ /// and return a `&str` which is UTF-8 encoded. This is also called "string slicing".
195+ /// Note this will panic if the byte indices provided are not character
196+ /// boundaries - see [`is_char_boundary`] for more details. The implementations
197+ /// for [`SliceIndex<str>`] describe string slicing further. For a non-panicking
198+ /// version of string slicing, see [`get`].
199+ ///
131200/// [`OsString`]: ../../std/ffi/struct.OsString.html "ffi::OsString"
201+ /// [`SliceIndex<str>`]: core::slice::SliceIndex
202+ /// [`as_bytes()`]: str::as_bytes
203+ /// [`get`]: str::get
204+ /// [`is_char_boundary`]: str::is_char_boundary
132205///
133- /// Indexing is intended to be a constant-time operation, but UTF-8 encoding
134- /// does not allow us to do this. Furthermore, it's not clear what sort of
135- /// thing the index should return: a byte, a codepoint, or a grapheme cluster.
136- /// The [`bytes`] and [`chars`] methods return iterators over the first
137- /// two, respectively.
206+ /// The [`bytes`] and [`chars`] methods return iterators over the bytes and
207+ /// codepoints of the string, respectively. To iterate over codepoints along
208+ /// with byte indices, use [`char_indices`].
138209///
139210/// [`bytes`]: str::bytes
140211/// [`chars`]: str::chars
212+ /// [`char_indices`]: str::char_indices
141213///
142214/// # Deref
143215///
0 commit comments