From 0e8ad5759c622d8fd8125cf316e6656206c7a4bc Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 13:55:28 +0100 Subject: [PATCH 1/7] added searching for frame sync bits when file type probing When a file contains an ID3 block, the following data might be an MP3 frame. The frame might be prepended by junk bytes, which are skipped with this commit. --- src/mp3/header.rs | 48 +++++++++++++++++++++ src/probe.rs | 103 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 126 insertions(+), 25 deletions(-) diff --git a/src/mp3/header.rs b/src/mp3/header.rs index acb83e47a..9ee9701de 100644 --- a/src/mp3/header.rs +++ b/src/mp3/header.rs @@ -9,6 +9,36 @@ pub(crate) fn verify_frame_sync(frame_sync: [u8; 2]) -> bool { frame_sync[0] == 0xFF && frame_sync[1] >> 5 == 0b111 } +/// Searches for a frame sync (11 bits with the value 1 like `0b1111_1111_111`) in the input reader. +/// The search starts at the beginning of the reader and returns the index relative to this beginning. +/// Only the first match is returned and on no match, [`None`] is returned instead. +/// +/// Note that the search searches in 8 bit steps, i.e. the first 8 bits need to be byte aligned. +pub(crate) fn search_for_frame_sync(input: &mut dyn Read) -> Result> { + let mut index = 0u64; + let mut iterator = input.bytes(); + let mut buffer = [0u8; 2]; + // Read the first byte, as each iteration expects that buffer 0 was set from a previous iteration. + // This is not the case in the first iteration, which is therefore a special case. + if let Some(byte) = iterator.next() { + buffer[0] = byte?; + } + // create a stream of overlapping 2 byte pairs + // example: [0x01, 0x02, 0x03, 0x04] should be analyzed as + // [0x01, 0x02], [0x02, 0x03], [0x03, 0x04] + while let Some(byte) = iterator.next() { + buffer[1] = byte?; + // check the two bytes in the buffer + if verify_frame_sync(buffer) { + return Ok(Some(index)); + } + // if they do not match, copy the last byte in the buffer to the front for the next iteration + buffer[0] = buffer[1]; + index += 1; + } + Ok(None) +} + #[derive(PartialEq, Copy, Clone, Debug)] #[allow(missing_docs)] /// MPEG Audio version @@ -200,3 +230,21 @@ impl XingHeader { } } } + +#[cfg(test)] +mod tests { + #[test] + fn search_for_frame_sync() { + fn test(data: &[u8], expected_result: Option) { + use super::search_for_frame_sync; + assert_eq!( + search_for_frame_sync(&mut Box::new(data)).unwrap(), + expected_result + ); + } + + test(&[0xFF, 0xFB, 0x00], Some(0)); + test(&[0x00, 0x00, 0x01, 0xFF, 0xFB], Some(3)); + test(&[0x01, 0xFF], None); + } +} diff --git a/src/probe.rs b/src/probe.rs index fd6a016c3..97883c205 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -2,7 +2,7 @@ use crate::ape::ApeFile; use crate::error::{LoftyError, Result}; use crate::iff::aiff::AiffFile; use crate::iff::wav::WavFile; -use crate::mp3::header::verify_frame_sync; +use crate::mp3::header::search_for_frame_sync; use crate::mp3::Mp3File; use crate::mp4::Mp4File; use crate::ogg::flac::FlacFile; @@ -149,41 +149,62 @@ impl Probe { #[allow(clippy::shadow_unrelated)] fn guess_inner(&mut self) -> Result> { + // temporary buffer for storing 36 bytes + // (36 is just a guess as to how long the data for estimating the file type might be) let mut buf = [0; 36]; - let pos = self.inner.seek(SeekFrom::Current(0))?; + // read the first 36 bytes and seek back to the starting position + let starting_position = self.inner.stream_position()?; let buf_len = std::io::copy( - &mut self.inner.by_ref().take(36), + &mut self.inner.by_ref().take(buf.len() as u64), &mut Cursor::new(&mut buf[..]), )? as usize; + self.inner.seek(SeekFrom::Start(starting_position))?; - self.inner.seek(SeekFrom::Start(pos))?; - + // estimate the file type by using these 36 bytes + // note that any error from `from_buffer_inner` are suppressed, as it returns an error on unknown format + // - TODO: why is the case `Err(LoftyError::UnknownFormat)` suppressed, but only for `from_buffer_inner`? + // - What is the special meaning of the return type `Ok(None)` vs `Err(LoftyError::UnknownFormat)`? match FileType::from_buffer_inner(&buf[..buf_len]) { + // the file type was guessed based on these bytes Ok((Some(f_ty), _)) => Ok(Some(f_ty)), + // the first data block is ID3 data; this means other data can follow (e.g. APE or MP3 frames) Ok((None, id3_len)) => { - self.inner + // the position right after the ID3 block is the internal size value (id3_len) + // added to the length of the ID3 header (which is 10 bytes), + // as the size does not include the header itself + let position_after_id3_block = self + .inner .seek(SeekFrom::Current(i64::from(10 + id3_len)))?; - let mut ident = [0; 3]; - let buf_len = std::io::copy( - &mut self.inner.by_ref().take(3), - &mut Cursor::new(&mut ident[..]), - )?; - - self.inner.seek(SeekFrom::Start(pos))?; - - if buf_len < 3 { - return Err(LoftyError::UnknownFormat); - } - - if &ident == b"MAC" { - Ok(Some(FileType::APE)) - } else if verify_frame_sync([ident[0], ident[1]]) { - Ok(Some(FileType::MP3)) - } else { - Err(LoftyError::UnknownFormat) - } + let file_type_after_id3_block = { + // try to guess the file type after the ID3 block by inspecting the first 3 bytes + let mut ident = [0; 3]; + let buf_len = std::io::copy( + &mut self.inner.by_ref().take(ident.len() as u64), + &mut Cursor::new(&mut ident[..]), + )?; + + if buf_len < 3 { + Err(LoftyError::UnknownFormat) + } else if &ident == b"MAC" { + Ok(Some(FileType::APE)) + } else { + // potentially some junk bytes are between the ID3 block and the following MP3 block + // search for any possible sync bits after the ID3 block + self.inner.seek(SeekFrom::Start(position_after_id3_block))?; + if let Some(_) = search_for_frame_sync(&mut self.inner)? { + Ok(Some(FileType::MP3)) + } else { + Err(LoftyError::UnknownFormat) + } + } + }; + + // before returning any result for a file type, seek back to the front + self.inner.seek(SeekFrom::Start(starting_position))?; + + file_type_after_id3_block }, _ => Ok(None), } @@ -247,3 +268,35 @@ where { Probe::open(path)?.read(read_properties) } + +#[cfg(test)] +mod tests { + use crate::Probe; + + #[test] + fn mp3_file_id3v2_3() { + // test data that contains 4 bytes of junk (0x20) between the ID3 portion and the first MP3 frame + let data: [&[u8]; 4] = [ + // ID3v2.3 header (10 bytes) + &[0x49, 0x44, 0x33, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23], + // TALB frame + &[ + 0x54, 0x41, 0x4C, 0x42, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x01, 0xFF, 0xFE, 0x61, + 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, + 0x00, 0x61, 0x00, 0x61, 0x00, 0x61, 0x00, + ], + // 4 bytes of junk + &[0x20, 0x20, 0x20, 0x20], + // start of MP3 frame (not all bytes are shown in this slice) + &[ + 0xFF, 0xFB, 0x50, 0xC4, 0x00, 0x03, 0xC0, 0x00, 0x01, 0xA4, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x34, 0x80, 0x00, 0x00, 0x04, + ], + ]; + let data: Vec = data.into_iter().flatten().cloned().collect(); + let data = std::io::Cursor::new(&data); + let probe = Probe::new(data); + let probe = probe.guess_file_type().unwrap(); + matches!(probe.file_type(), Some(crate::FileType::MP3)); + } +} From 47a28402dbcff6f19e132c724bc355e2911a300f Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 14:52:32 +0100 Subject: [PATCH 2/7] added frame sync search for MP3 reading The first MP3 frame behind metadata blocks is found by searching for frame sync bits. This skips junk bytes between any metadata blocks and the first MP3 frame. --- src/mp3/read.rs | 37 +++++++++++++++++++++++++------------ tests/files/assets/b.mp3 | Bin 0 -> 1458 bytes tests/files/mpeg.rs | 25 ++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 tests/files/assets/b.mp3 diff --git a/src/mp3/read.rs b/src/mp3/read.rs index 7d1c8241a..e923ec9dd 100644 --- a/src/mp3/read.rs +++ b/src/mp3/read.rs @@ -1,4 +1,4 @@ -use super::header::{verify_frame_sync, Header, XingHeader}; +use super::header::{search_for_frame_sync, Header, XingHeader}; use super::{Mp3File, Mp3Properties}; use crate::ape::constants::APE_PREAMBLE; #[cfg(feature = "ape")] @@ -12,7 +12,7 @@ use crate::id3::{find_id3v1, find_lyrics3v2, ID3FindResults}; use std::io::{Read, Seek, SeekFrom}; -use byteorder::ReadBytesExt; +use byteorder::{BigEndian, ReadBytesExt}; pub(super) fn read_from( reader: &mut R, @@ -80,17 +80,30 @@ where continue; } }, - _ if verify_frame_sync([header[0], header[1]]) => { - let start = reader.seek(SeekFrom::Current(0))? - 4; - let header = Header::read(u32::from_be_bytes(header))?; - - file.first_frame_offset = Some(start); - first_frame_header = Some(header); - - // We have found the first frame - break; + // metadata blocks might be followed by junk bytes before the first MP3 frame begins + _ => { + // seek back the length of the temporary header buffer + // so that all bytes are included in the search for a frame sync + let start_of_search_area = + reader.seek(SeekFrom::Current(-1 * header.len() as i64))?; + if let Some(first_mp3_frame_start_relative) = search_for_frame_sync(reader)? { + let first_mp3_frame_start_absolute = + start_of_search_area + first_mp3_frame_start_relative; + + // read the first four bytes of the found frame + reader.seek(SeekFrom::Start(first_mp3_frame_start_absolute))?; + let header = Header::read(reader.read_u32::()?)?; + + file.first_frame_offset = Some(first_mp3_frame_start_absolute); + first_frame_header = Some(header); + + // We have found the first frame + break; + } else { + // the search for sync bits was unsuccessful + return Err(LoftyError::Mp3("File contains an invalid frame")); + } }, - _ => return Err(LoftyError::Mp3("File contains an invalid frame")), } } diff --git a/tests/files/assets/b.mp3 b/tests/files/assets/b.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..b8fe60bf4cbc3830b1ae59945cc7b49ad7947243 GIT binary patch literal 1458 zcmeZtF=l1}0w%!_M;|93Lz01k@&CU>h8%_@hEj%H1_g!^hE#@PAj!ZG65whGq@rg?@K0V#-uAd^af zmZqXu2$UW;pa2B_e+LkBfM;G>K2Q-q5Hm3_un05EBo+K88Gsz+lUSB)YN2NciV`?b zVP%J~S%f(bd#Az0p^X2(C!{d&e_#NHHUk3(0|T!DFq(j936Kk9nlu1C;N$4)YOH5y zU=TX;z<~Sx|L;jD4*mEeXyir9VDfQ@qkBnaNlvOlNop~uP{aUhK05%tl5dZ)H literal 0 HcmV?d00001 diff --git a/tests/files/mpeg.rs b/tests/files/mpeg.rs index 164a6b7e3..7972f7e4d 100644 --- a/tests/files/mpeg.rs +++ b/tests/files/mpeg.rs @@ -1,5 +1,5 @@ use crate::{set_artist, temp_file, verify_artist}; -use lofty::{FileType, ItemKey, ItemValue, TagItem, TagType}; +use lofty::{Accessor, FileType, ItemKey, ItemValue, TagItem, TagType}; use std::io::{Seek, SeekFrom, Write}; #[test] @@ -19,6 +19,29 @@ fn read() { crate::verify_artist!(file, tag, TagType::Ape, "Baz artist", 1); } +#[test] +fn read_with_junk_bytes_between_frames() { + // Read a file that includes an ID3v2.3 data block followed by four bytes of junk data (0x20) + let file = lofty::read_from_path("tests/files/assets/b.mp3", true).unwrap(); + + // note that the file contains ID3v2 and ID3v1 data + assert_eq!(file.file_type(), &FileType::MP3); + + let id3v2_tag = &file.tags()[0]; + assert_eq!(id3v2_tag.artist(), Some("artist test")); + assert_eq!(id3v2_tag.album(), Some("album test")); + assert_eq!(id3v2_tag.title(), Some("title test")); + assert_eq!( + id3v2_tag.get_string(&ItemKey::EncoderSettings), + Some("Lavf58.62.100") + ); + + let id3v1_tag = &file.tags()[1]; + assert_eq!(id3v1_tag.artist(), Some("artist test")); + assert_eq!(id3v1_tag.album(), Some("album test")); + assert_eq!(id3v1_tag.title(), Some("title test")); +} + #[test] fn write() { let mut file = temp_file!("tests/files/assets/a.mp3"); From 3783bfbbb883432fa45bf718988564fa12cc606b Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:21:47 +0100 Subject: [PATCH 3/7] renamed test asset file --- .../assets/{b.mp3 => junk_between_id3_and_mp3.mp3} | Bin tests/files/mpeg.rs | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) rename tests/files/assets/{b.mp3 => junk_between_id3_and_mp3.mp3} (100%) diff --git a/tests/files/assets/b.mp3 b/tests/files/assets/junk_between_id3_and_mp3.mp3 similarity index 100% rename from tests/files/assets/b.mp3 rename to tests/files/assets/junk_between_id3_and_mp3.mp3 diff --git a/tests/files/mpeg.rs b/tests/files/mpeg.rs index 7972f7e4d..929c7d92b 100644 --- a/tests/files/mpeg.rs +++ b/tests/files/mpeg.rs @@ -22,7 +22,8 @@ fn read() { #[test] fn read_with_junk_bytes_between_frames() { // Read a file that includes an ID3v2.3 data block followed by four bytes of junk data (0x20) - let file = lofty::read_from_path("tests/files/assets/b.mp3", true).unwrap(); + let file = + lofty::read_from_path("tests/files/assets/junk_between_id3_and_mp3.mp3", true).unwrap(); // note that the file contains ID3v2 and ID3v1 data assert_eq!(file.file_type(), &FileType::MP3); From 134d91d61af9bd1376d1c881433a03cef6fea1e0 Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:31:14 +0100 Subject: [PATCH 4/7] refactor after code review --- src/mp3/header.rs | 5 ++++- src/probe.rs | 5 +---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mp3/header.rs b/src/mp3/header.rs index 9ee9701de..240881c62 100644 --- a/src/mp3/header.rs +++ b/src/mp3/header.rs @@ -14,7 +14,10 @@ pub(crate) fn verify_frame_sync(frame_sync: [u8; 2]) -> bool { /// Only the first match is returned and on no match, [`None`] is returned instead. /// /// Note that the search searches in 8 bit steps, i.e. the first 8 bits need to be byte aligned. -pub(crate) fn search_for_frame_sync(input: &mut dyn Read) -> Result> { +pub(crate) fn search_for_frame_sync(input: &mut R) -> Result> +where + R: Read, +{ let mut index = 0u64; let mut iterator = input.bytes(); let mut buffer = [0u8; 2]; diff --git a/src/probe.rs b/src/probe.rs index 97883c205..dd99f950e 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -163,8 +163,6 @@ impl Probe { // estimate the file type by using these 36 bytes // note that any error from `from_buffer_inner` are suppressed, as it returns an error on unknown format - // - TODO: why is the case `Err(LoftyError::UnknownFormat)` suppressed, but only for `from_buffer_inner`? - // - What is the special meaning of the return type `Ok(None)` vs `Err(LoftyError::UnknownFormat)`? match FileType::from_buffer_inner(&buf[..buf_len]) { // the file type was guessed based on these bytes Ok((Some(f_ty), _)) => Ok(Some(f_ty)), @@ -295,8 +293,7 @@ mod tests { ]; let data: Vec = data.into_iter().flatten().cloned().collect(); let data = std::io::Cursor::new(&data); - let probe = Probe::new(data); - let probe = probe.guess_file_type().unwrap(); + let probe = Probe::new(data).guess_file_type().unwrap(); matches!(probe.file_type(), Some(crate::FileType::MP3)); } } From e1a8e08a3fc9e0eef52e7a2f624f65307b907b28 Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:34:49 +0100 Subject: [PATCH 5/7] return type of `guess_inner` is now `std::io::Result` --- src/mp3/header.rs | 2 +- src/probe.rs | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/mp3/header.rs b/src/mp3/header.rs index 240881c62..a5de13f59 100644 --- a/src/mp3/header.rs +++ b/src/mp3/header.rs @@ -14,7 +14,7 @@ pub(crate) fn verify_frame_sync(frame_sync: [u8; 2]) -> bool { /// Only the first match is returned and on no match, [`None`] is returned instead. /// /// Note that the search searches in 8 bit steps, i.e. the first 8 bits need to be byte aligned. -pub(crate) fn search_for_frame_sync(input: &mut R) -> Result> +pub(crate) fn search_for_frame_sync(input: &mut R) -> std::io::Result> where R: Read, { diff --git a/src/probe.rs b/src/probe.rs index dd99f950e..47df2f897 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -148,7 +148,7 @@ impl Probe { } #[allow(clippy::shadow_unrelated)] - fn guess_inner(&mut self) -> Result> { + fn guess_inner(&mut self) -> std::io::Result> { // temporary buffer for storing 36 bytes // (36 is just a guess as to how long the data for estimating the file type might be) let mut buf = [0; 36]; @@ -178,14 +178,12 @@ impl Probe { let file_type_after_id3_block = { // try to guess the file type after the ID3 block by inspecting the first 3 bytes let mut ident = [0; 3]; - let buf_len = std::io::copy( + std::io::copy( &mut self.inner.by_ref().take(ident.len() as u64), &mut Cursor::new(&mut ident[..]), )?; - if buf_len < 3 { - Err(LoftyError::UnknownFormat) - } else if &ident == b"MAC" { + if &ident == b"MAC" { Ok(Some(FileType::APE)) } else { // potentially some junk bytes are between the ID3 block and the following MP3 block @@ -194,7 +192,7 @@ impl Probe { if let Some(_) = search_for_frame_sync(&mut self.inner)? { Ok(Some(FileType::MP3)) } else { - Err(LoftyError::UnknownFormat) + Ok(None) } } }; From a326788819dcb765ca923046bb001cf4fa0226c3 Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:42:01 +0100 Subject: [PATCH 6/7] implemented clippy suggestions --- src/mp3/header.rs | 6 ++---- src/mp3/read.rs | 7 +++---- src/probe.rs | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/mp3/header.rs b/src/mp3/header.rs index a5de13f59..49924a070 100644 --- a/src/mp3/header.rs +++ b/src/mp3/header.rs @@ -18,7 +18,6 @@ pub(crate) fn search_for_frame_sync(input: &mut R) -> std::io::Result Probe { // potentially some junk bytes are between the ID3 block and the following MP3 block // search for any possible sync bits after the ID3 block self.inner.seek(SeekFrom::Start(position_after_id3_block))?; - if let Some(_) = search_for_frame_sync(&mut self.inner)? { + if search_for_frame_sync(&mut self.inner)?.is_some() { Ok(Some(FileType::MP3)) } else { Ok(None) From b9c469173a5b4a96567e9241856500f13fa75959 Mon Sep 17 00:00:00 2001 From: localthomas <81923579+localthomas@users.noreply.github.com> Date: Fri, 21 Jan 2022 18:15:05 +0100 Subject: [PATCH 7/7] negative multiply made more readable --- src/mp3/read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mp3/read.rs b/src/mp3/read.rs index 93140bd78..10bc466df 100644 --- a/src/mp3/read.rs +++ b/src/mp3/read.rs @@ -84,8 +84,8 @@ where _ => { // seek back the length of the temporary header buffer // so that all bytes are included in the search for a frame sync - let start_of_search_area = - reader.seek(SeekFrom::Current(-(header.len() as i64)))?; + #[allow(clippy::neg_multiply)] + let start_of_search_area = reader.seek(SeekFrom::Current(-1 * header.len() as i64))?; if let Some(first_mp3_frame_start_relative) = search_for_frame_sync(reader)? { let first_mp3_frame_start_absolute = start_of_search_area + first_mp3_frame_start_relative;