diff --git a/CHANGELOG.md b/CHANGELOG.md index eb6774bcf..db0256ceb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **ID3v2**: - `Id3v2ErrorKind::UnsupportedFrameId` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/212)) - `FrameValue::KeyValue` for TIPL/TMCL frames ([PR](https://github.com/Serial-ATA/lofty-rs/pull/214)) +- **ParseOptions**: `ParseOptions::max_junk_bytes`, allowing the parser to sift through junk bytes to find required information, rather than + immediately declare a file invalid. ([discussion](https://github.com/Serial-ATA/lofty-rs/discussions/219)) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/227)) ## Changed - **ID3v2**: diff --git a/src/file.rs b/src/file.rs index 233722853..841461eaa 100644 --- a/src/file.rs +++ b/src/file.rs @@ -894,8 +894,8 @@ impl FileType { /// ``` pub fn from_buffer(buf: &[u8]) -> Option { match Self::from_buffer_inner(buf) { - (Some(f_ty), _) => Some(f_ty), - // We make no attempt to search past an ID3v2 tag here, since + FileTypeGuessResult::Determined(file_ty) => Some(file_ty), + // We make no attempt to search past an ID3v2 tag or junk here, since // we only provided a fixed-sized buffer to search from. // // That case is handled in `Probe::guess_file_type` @@ -904,28 +904,31 @@ impl FileType { } // TODO: APE tags in the beginning of the file - pub(crate) fn from_buffer_inner(buf: &[u8]) -> (Option, Option) { + pub(crate) fn from_buffer_inner(buf: &[u8]) -> FileTypeGuessResult { use crate::id3::v2::util::synchsafe::SynchsafeInteger; - // Start out with an empty return: (File type, id3 size) - // Only one can be set - let mut ret = (None, None); + // Start out with an empty return + let mut ret = FileTypeGuessResult::Undetermined; if buf.is_empty() { return ret; } match Self::quick_type_guess(buf) { - Some(f_ty) => ret.0 = Some(f_ty), + Some(f_ty) => ret = FileTypeGuessResult::Determined(f_ty), // Special case for ID3, gets checked in `Probe::guess_file_type` // The bare minimum size for an ID3v2 header is 10 bytes None if buf.len() >= 10 && &buf[..3] == b"ID3" => { // This is infallible, but preferable to an unwrap if let Ok(arr) = buf[6..10].try_into() { // Set the ID3v2 size - ret.1 = Some(u32::from_be_bytes(arr).unsynch()); + ret = + FileTypeGuessResult::MaybePrecededById3(u32::from_be_bytes(arr).unsynch()); } }, + None if buf.first().copied() == Some(0) => { + ret = FileTypeGuessResult::MaybePrecededByJunk + }, // We aren't able to determine a format _ => {}, } @@ -1008,3 +1011,18 @@ impl FileType { } } } + +/// The result of a `FileType` guess +/// +/// External callers of `FileType::from_buffer()` will only ever see `Determined` cases. +/// The remaining cases are used internally in `Probe::guess_file_type()`. +pub(crate) enum FileTypeGuessResult { + /// The `FileType` was guessed + Determined(FileType), + /// The stream starts with an ID3v2 tag + MaybePrecededById3(u32), + /// The stream starts with junk zero bytes + MaybePrecededByJunk, + /// The `FileType` could not be guessed + Undetermined, +} diff --git a/src/probe.rs b/src/probe.rs index f84403810..bdd14faff 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -1,7 +1,7 @@ use crate::aac::AacFile; use crate::ape::ApeFile; use crate::error::Result; -use crate::file::{AudioFile, FileType, TaggedFile}; +use crate::file::{AudioFile, FileType, FileTypeGuessResult, TaggedFile}; use crate::flac::FlacFile; use crate::iff::aiff::AiffFile; use crate::iff::wav::WavFile; @@ -20,6 +20,8 @@ use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek, SeekFrom}; use std::path::Path; +const MAX_JUNK_BYTES: usize = 1024; + /// Options to control how Lofty parses a file #[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq)] #[non_exhaustive] @@ -27,6 +29,7 @@ pub struct ParseOptions { pub(crate) read_properties: bool, pub(crate) use_custom_resolvers: bool, pub(crate) parsing_mode: ParsingMode, + pub(crate) max_junk_bytes: usize, } impl Default for ParseOptions { @@ -38,7 +41,8 @@ impl Default for ParseOptions { /// ParseOptions { /// read_properties: true, /// use_custom_resolvers: true, - /// parsing_mode: ParsingMode::Strict, + /// parsing_mode: ParsingMode::BestAttempt, + /// max_junk_bytes: 1024 /// } /// ``` fn default() -> Self { @@ -64,6 +68,7 @@ impl ParseOptions { read_properties: true, use_custom_resolvers: true, parsing_mode: ParsingMode::BestAttempt, + max_junk_bytes: MAX_JUNK_BYTES, } } @@ -106,13 +111,31 @@ impl ParseOptions { /// ```rust /// use lofty::{ParseOptions, ParsingMode}; /// - /// // By default, `parsing_mode` is ParsingMode::Strict. Here, we don't need absolute correctness. - /// let parsing_options = ParseOptions::new().parsing_mode(ParsingMode::Relaxed); + /// // By default, `parsing_mode` is ParsingMode::BestAttempt. Here, we need absolute correctness. + /// let parsing_options = ParseOptions::new().parsing_mode(ParsingMode::Strict); /// ``` pub fn parsing_mode(&mut self, parsing_mode: ParsingMode) -> Self { self.parsing_mode = parsing_mode; *self } + + /// The maximum number of allowed junk bytes to search + /// + /// Some information may be surrounded by junk bytes, such as tag padding remnants. This sets the maximum + /// number of junk/unrecognized bytes Lofty will search for required information before giving up. + /// + /// # Examples + /// + /// ```rust + /// use lofty::ParseOptions; + /// + /// // I have files full of junk, I'll double the search window! + /// let parsing_options = ParseOptions::new().max_junk_bytes(2048); + /// ``` + pub fn max_junk_bytes(&mut self, max_junk_bytes: usize) -> Self { + self.max_junk_bytes = max_junk_bytes; + *self + } } /// The parsing strictness mode @@ -410,6 +433,8 @@ impl Probe { /// /// On success, the file type will be replaced /// + /// NOTE: This is influenced by `ParseOptions`, be sure to set it with `Probe::options()` prior to calling this. + /// /// # Errors /// /// All errors that occur within this function are [`std::io::Error`]. @@ -432,14 +457,18 @@ impl Probe { /// # Ok(()) } /// ``` pub fn guess_file_type(mut self) -> std::io::Result { - let f_ty = self.guess_inner()?; + let max_junk_bytes = self + .options + .map_or(MAX_JUNK_BYTES, |options| options.max_junk_bytes); + + let f_ty = self.guess_inner(max_junk_bytes)?; self.f_ty = f_ty.or(self.f_ty); Ok(self) } #[allow(clippy::shadow_unrelated)] - fn guess_inner(&mut self) -> std::io::Result> { + fn guess_inner(&mut self, max_junk_bytes: usize) -> std::io::Result> { // temporary buffer for storing 36 bytes // (36 is just a guess as to how long the data for estimating the file type might be) let mut buf = [0; 36]; @@ -456,9 +485,9 @@ impl Probe { // Guess the file type by using these 36 bytes match FileType::from_buffer_inner(&buf[..buf_len]) { // We were able to determine a file type - (Some(f_ty), _) => Ok(Some(f_ty)), + FileTypeGuessResult::Determined(file_ty) => Ok(Some(file_ty)), // The file starts with an ID3v2 tag; this means other data can follow (e.g. APE or MP3 frames) - (None, Some(id3_len)) => { + FileTypeGuessResult::MaybePrecededById3(id3_len) => { // `id3_len` is the size of the tag, not including the header (10 bytes) let position_after_id3_block = self .inner @@ -478,21 +507,7 @@ impl Probe { b"fLaC" => Ok(Some(FileType::Flac)), b"MPCK" | [b'M', b'P', b'+', ..] => Ok(Some(FileType::Mpc)), // Search for a frame sync, which may be preceded by junk - _ if search_for_frame_sync(&mut self.inner)?.is_some() => { - // Seek back to the start of the frame sync to check if we are dealing with - // an AAC or MPEG file. See `FileType::quick_type_guess` for explanation. - self.inner.seek(SeekFrom::Current(-2))?; - - let mut buf = [0; 2]; - self.inner.read_exact(&mut buf)?; - - if buf[1] & 0b10000 > 0 && buf[1] & 0b110 == 0 { - Ok(Some(FileType::Aac)) - } else { - Ok(Some(FileType::Mpeg)) - } - }, - _ => Ok(None), + _ => self.check_mpeg_or_aac(max_junk_bytes), }; // before returning any result for a file type, seek back to the front @@ -500,6 +515,15 @@ impl Probe { file_type_after_id3_block }, + // TODO: Check more than MPEG/AAC + FileTypeGuessResult::MaybePrecededByJunk => { + let ret = self.check_mpeg_or_aac(max_junk_bytes); + + // before returning any result for a file type, seek back to the front + self.inner.seek(SeekFrom::Start(starting_position))?; + + ret + }, _ => { if let Ok(lock) = CUSTOM_RESOLVERS.lock() { #[allow(clippy::significant_drop_in_scrutinee)] @@ -515,6 +539,29 @@ impl Probe { } } + /// Searches for an MPEG/AAC frame sync, which may be preceded by junk bytes + fn check_mpeg_or_aac(&mut self, max_junk_bytes: usize) -> std::io::Result> { + { + let mut restricted_reader = self.inner.by_ref().take(max_junk_bytes as u64); + if search_for_frame_sync(&mut restricted_reader)?.is_none() { + return Ok(None); + } + } + + // Seek back to the start of the frame sync to check if we are dealing with + // an AAC or MPEG file. See `FileType::quick_type_guess` for explanation. + self.inner.seek(SeekFrom::Current(-2))?; + + let mut buf = [0; 2]; + self.inner.read_exact(&mut buf)?; + + if buf[1] & 0b10000 > 0 && buf[1] & 0b110 == 0 { + Ok(Some(FileType::Aac)) + } else { + Ok(Some(FileType::Mpeg)) + } + } + /// Attempts to extract a [`TaggedFile`] from the reader /// /// If `read_properties` is false, the properties will be zeroed out. @@ -716,6 +763,11 @@ mod tests { test_probe("tests/files/assets/minimal/full_test.mp3", FileType::Mpeg); } + #[test] + fn probe_mp3_with_lots_of_junk() { + test_probe("tests/files/assets/junk.mp3", FileType::Mpeg); + } + #[test] fn probe_vorbis() { test_probe("tests/files/assets/minimal/full_test.ogg", FileType::Vorbis); diff --git a/tests/files/assets/junk.mp3 b/tests/files/assets/junk.mp3 new file mode 100644 index 000000000..9cab8441b Binary files /dev/null and b/tests/files/assets/junk.mp3 differ