use core::{cmp, fmt, mem, str}; use proc_macro::{Literal, TokenStream, TokenTree}; /// Compile time pattern parser. /// /// ```ignore /// const PATTERN: &[pelite::pattern::Atom] = pattern!("pattern string"); /// ``` pub fn proc_pattern(input: TokenStream) -> TokenStream { let input = input.into_iter().collect::>(); let string = match &input[..] { [TokenTree::Literal(lit)] => parse_str_literal(&lit), _e => panic!("expected a single string literal to parse, got: {_e:?}"), }; let pattern = match parse(&string) { Ok(pattern) => pattern, Err(err) => panic!("invalid pattern syntax: {}", err), }; format!("{{ use x::xpat::Atom::*; &{:?} as x::Pattern }}", pattern).parse().unwrap() } fn parse_str_literal(input: &Literal) -> String { let input = input.to_string(); let mut chars = input.chars(); let mut string = String::new(); if chars.next() != Some('"') { panic!("expected string literal starting with a `\"` and no extraneous whitespace"); } loop { let chr = match chars.next() { Some('\\') => { match chars.next() { Some('\\') => '\\', Some('\'') => '\'', Some('\"') => '\"', Some('t') => '\t', Some('r') => '\r', Some('n') => '\n', Some('u') => panic!("unicode escape sequence not supported"), Some(chr) => panic!("unknown escape sequence: {}", chr), None => panic!(""), } }, Some('"') => break, Some(chr) => chr, None => panic!("unexpected end of string literal, missing `\"` terminator?"), }; string.push(chr); } string } /// Special skip value to indicate to use platform pointer size instead. pub(crate) const PTR_SKIP: u8 = 0; /// Pattern parsing error. #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct ParsePatError { kind: PatError, position: usize, } impl fmt::Display for ParsePatError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "Syntax Error @{}: {}.", self.position, self.kind.to_str()) } } #[derive(Copy, Clone, Debug, Eq, PartialEq)] enum PatError { UnpairedHexDigit, UnknownChar, ManyOverflow, ManyRange, ManyInvalid, SaveOverflow, StackError, StackInvalid, UnclosedQuote, AlignedOperand, CheckOperand, ReadOperand, SubPattern, SubOverflow, DoubleNibble } impl PatError { fn to_str(self) -> &'static str { match self { PatError::UnpairedHexDigit => "unpaired hex digit", PatError::UnknownChar => "unknown character", PatError::ManyOverflow => "many range exceeded", PatError::ManyRange => "many bounds nonsensical", PatError::ManyInvalid => "many invalid syntax", PatError::SaveOverflow => "save store overflow", PatError::StackError => "stack unbalanced", PatError::StackInvalid => "stack must follow jump", PatError::UnclosedQuote => "string missing end quote", PatError::AlignedOperand => "aligned operand error", PatError::CheckOperand => "aligned operand error", PatError::ReadOperand => "read operand error", PatError::SubPattern => "sub pattern error", PatError::SubOverflow => "sub pattern too large", PatError::DoubleNibble => "unpaired nibble wildcard", } } } //---------------------------------------------------------------- include!("../../xpat/src/atoms.rs"); /// Pattern parser. /// /// # Remarks /// /// Following are examples of the pattern syntax. /// The syntax takes inspiration from [YARA hexadecimal strings](https://yara.readthedocs.io/en/v3.7.0/writingrules.html#hexadecimal-strings). /// /// ```text /// 55 89 e5 83 ? ec /// ``` /// /// Case insensitive hexadecimal characters match the exact byte pattern and question marks serve as placeholders for unknown bytes. /// /// Note that a single question mark matches a whole byte. The syntax to mask part of a byte is not yet available. /// /// Spaces (code point 32) are completely optional and carry no semantic meaning, their purpose is to visually group things together. /// /// ```text /// b9 ' 37 13 00 00 /// ``` /// /// Single quotes are used as a bookmarks, to save the current cursor rva in the save array passed to the scanner. /// /// It is no longer necessary to do tedious address calculations to read information out of the byte stream after a match was found. /// This power really comes to life with the capability to follow relative and absolute references. /// /// The first entry in the save array is reserved for the rva where the pattern was matched. /// The rest of the save array is filled in order of appearance of the quotes. Here the rva of the quote can be found in `save[1]`. /// /// ```text /// b8 [16] 50 [13-42] ff /// ``` /// /// Pairs of decimal numbers separated by a hypen in square brackets indicate the lower and upper bound of number of bytes to skip. /// The scanner is non greedy and considers the first match while skipping as little as possible. /// /// A single decimal number in square brackets without hypens is a fixed size jump, equivalent to writing that number of consecutive question marks. /// /// ```text /// 31 c0 74 % ' c3 /// e8 $ ' 31 c0 c3 /// 68 * ' 31 c0 c3 /// ``` /// /// These symbols are used to follow; a signed 1 byte relative jump: `%`, a signed 4 byte relative jump: `$` and an absolute pointer: `*`. /// /// They are designed to be able to have the scanner follow short jumps, calls and longer jumps, and absolute pointers. /// /// Composes really well with bookmarks to find the addresses of referenced functions and other data without tedious address calculations. /// /// ```text /// b8 * "STRING" 00 /// ``` /// /// String literals appear in double quotes and will be matched as UTF-8. /// /// Escape sequences are not supported, switch back to matching with hex digits as needed. /// For UTF-16 support, you are welcome to send a PR. /// /// ```text /// e8 $ { ' } 83 f0 5c c3 /// ``` /// /// Curly braces must follow a jump symbol (see above). /// /// The sub pattern enclosed within the curly braces is matched at the destination after following the jump. /// After the pattern successfully matched, the cursor returns to before the jump was followed. /// The bytes defining the jump are skipped and matching continues again from here. /// /// ```text /// e8 $ @4 /// ``` /// /// Checks that the cursor is aligned at this point in the scan. /// The align value is `(1 << arg)`, in this example the cursor is checked to be aligned to 16. /// /// ```text /// e8 i1 a0 u4 /// ``` /// /// An `i` or `u` indicates memory read operations followed by the size of the operand to read. /// /// The read values are stored in the save array alongside the bookmarked addresses (single quotes). /// This means the values are sign- or zero- extended respectively before being stored. /// Operand sizes are 1 (byte), 2 (word) or 4 (dword). /// /// The cursor is advanced by the size of the operand. /// /// ```text /// 83 c0 2a ( 6a ? | 68 ? ? ? ? ) e8 /// ``` /// /// Parentheses indicate alternate subpatterns separated by a pipe character. /// /// The scanner attempts to match the alternate subpatterns from left to right and fails if none of them match. pub fn parse(pat: &str) -> Result, ParsePatError> { let mut result = Vec::with_capacity(pat.len() / 2); let mut pat_end = pat; match parse_helper(&mut pat_end, &mut result) { Ok(()) => Ok(result), Err(kind) => { let position = pat_end.as_ptr() as usize - pat.as_ptr() as usize; Err(ParsePatError { kind, position }) }, } } // This is preferable but currently limited by macro rules... // pub use crate::pattern as parse; fn parse_helper(pat: &mut &str, result: &mut Vec) -> Result<(), PatError> { result.push(Atom::Save(0)); let mut iter = pat.as_bytes().iter(); let mut save = 1; let mut depth = 0; #[derive(Default)] struct SubPattern { case: usize, brks: Vec, save: u8, save_next: u8, depth: u8, } let mut subs = Vec::::new(); while let Some(mut chr) = iter.next().cloned() { match chr { // Follow signed 1 byte jump b'%' => result.push(Atom::Jump1), // Follow signed 4 byte jump b'$' => result.push(Atom::Jump4), // Follow pointer b'*' => result.push(Atom::Ptr), // Start recursive operator b'{' => { depth += 1; // Must follow a jump operator and insert push before the jump let atom = match result.last_mut() { Some(atom @ Atom::Jump1) => mem::replace(atom, Atom::Push(1)), Some(atom @ Atom::Jump4) => mem::replace(atom, Atom::Push(4)), Some(atom @ Atom::Ptr) => mem::replace(atom, Atom::Push(PTR_SKIP)), _ => return Err(PatError::StackInvalid), }; result.push(atom); }, // End recursive operator b'}' => { // Unbalanced recursion if depth <= 0 { return Err(PatError::StackError); } depth -= 1; result.push(Atom::Pop); }, // Start subpattern b'(' => { subs.push(SubPattern::default()); let sub = subs.last_mut().unwrap(); // Keep the save and depth state sub.save = save; sub.depth = depth; // Add a new case, update the case offset later sub.case = result.len(); result.push(Atom::Case(0)); }, // Case subpattern b'|' => { // Should already have started a subpattern let sub = subs.last_mut().ok_or(PatError::SubPattern)?; // Update the save state sub.save_next = cmp::max(sub.save_next, save); save = sub.save; depth = sub.depth; // Add a break of the previous subpattern sub.brks.push(result.len()); result.push(Atom::Break(0)); // Add a new case of the next subpattern let case_offset = result.len() - sub.case - 1; if case_offset >= 256 { return Err(PatError::SubOverflow); } result[sub.case] = Atom::Case(case_offset as u8); sub.case = result.len(); result.push(Atom::Case(0)); }, // End subpattern b')' => { // Should already have started a subpattern let sub = subs.pop().ok_or(PatError::SubPattern)?; // Prepare for the next save save = cmp::max(sub.save_next, save); depth = sub.depth; // Neutralize the last case, since there are no more result[sub.case] = Atom::Nop; // Fill in the breaks for &brk in &sub.brks { let brk_offset = result.len() - brk - 1; if brk_offset >= 256 { return Err(PatError::SubOverflow); } result[brk] = Atom::Break(brk_offset as u8); } }, // Skip many operator b'[' => { // Parse the lower bound let mut lower_bound = 0u32; let mut at_least_one_char = false; loop { chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; match chr { b'-' | b']' => break, chr @ b'0'..=b'9' => { at_least_one_char = true; lower_bound = lower_bound * 10 + (chr - b'0') as u32; if lower_bound >= 16384 { return Err(PatError::ManyOverflow); } }, _ => return Err(PatError::ManyInvalid), } } if !at_least_one_char { return Err(PatError::ManyInvalid); } // Turn the lower bound into skip ops if lower_bound > 0 { if lower_bound >= 256 { result.push(Atom::Rangext((lower_bound >> 8) as u8)); } result.push(Atom::Skip((lower_bound & 0xff) as u8)); } // Second many part is optional if chr == b']' { continue; } // Parse the upper bound let mut upper_bound = 0u32; loop { chr = iter.next().cloned().ok_or(PatError::ManyInvalid)?; match chr { b']' => break, chr @ b'0'..=b'9' => { upper_bound = upper_bound * 10 + (chr - b'0') as u32; if upper_bound >= 16384 { return Err(PatError::ManyOverflow); } }, _ => return Err(PatError::ManyInvalid), } } // Lower bound should be strictly less than the upper bound if lower_bound < upper_bound { let many_skip = upper_bound - lower_bound; if many_skip >= 256 { result.push(Atom::Rangext((many_skip >> 8) as u8)); } result.push(Atom::Many((many_skip & 0xff) as u8)); } else { return Err(PatError::ManyRange); } }, // Match a byte b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' | b'.' => { let mut mask = 0xFF; // High nibble of the byte let hi = if chr == b'.' { mask &= 0x0F;0 } else if chr >= b'a' { chr - b'a' + 10 } else if chr >= b'A' { chr - b'A' + 10 } else { chr - b'0' }; chr = iter.next().cloned().ok_or(PatError::UnpairedHexDigit)?; // Low nibble of the byte let lo = if chr >= b'a' && chr <= b'f' { chr - b'a' + 10 } else if chr >= b'A' && chr <= b'F' { chr - b'A' + 10 } else if chr >= b'0' && chr <= b'9' { chr - b'0' } else if chr == b'.' { mask &= 0xF0; 0 } else { return Err(PatError::UnpairedHexDigit); }; if mask == 0 { return Err(PatError::DoubleNibble); }; // mask out nibble if mask != 0xFF { result.push(Atom::Fuzzy(mask)) } // Add byte to the pattern result.push(Atom::Byte((hi << 4) + lo)); }, // Match raw bytes b'"' => { loop { if let Some(chr) = iter.next().cloned() { if chr != b'"' { result.push(Atom::Byte(chr)); } else { break; } } else { return Err(PatError::UnclosedQuote); } } }, // Save the cursor b'\'' => { // 'Limited' save space if save >= u8::MAX { return Err(PatError::SaveOverflow); } result.push(Atom::Save(save)); save += 1; }, // Skip bytes b'?' => { // match result.last_mut() { // Some(Atom::Skip(skip)) if *skip != PTR_SKIP && *skip < 127i8 => *skip += 1, // _ => result.push(Atom::Skip(1)), // }; // Coalescence skips together if let Some(Atom::Skip(skip)) = result.last_mut() { if *skip != PTR_SKIP && *skip < 255u8 { *skip += 1; continue; } } result.push(Atom::Skip(1)); }, b'=' => { let op = iter.next().cloned().ok_or(PatError::CheckOperand)?; result.push( match op { b'0'..=b'9' => Atom::Check(op - b'0'), b'A'..=b'Z' => Atom::Check(10 + (op - b'A')), b'a'..=b'z' => Atom::Check(10 + (op - b'a')), _ => return Err(PatError::CheckOperand) }); }, b'@' => { let op = iter.next().cloned().ok_or(PatError::AlignedOperand)?; result.push( match op { b'0'..=b'9' => Atom::Aligned(op - b'0'), b'A'..=b'Z' => Atom::Aligned(10 + (op - b'A')), b'a'..=b'z' => Atom::Aligned(10 + (op - b'a')), _ => return Err(PatError::AlignedOperand) }); }, b'i' => { let atom = match iter.next().cloned() { Some(b'1') => Atom::ReadI8(save), Some(b'2') => Atom::ReadI16(save), Some(b'4') => Atom::ReadI32(save), _ => return Err(PatError::ReadOperand), }; if save >= u8::MAX { return Err(PatError::SaveOverflow); } save += 1; result.push(atom); }, b'u' => { let atom = match iter.next().cloned() { Some(b'1') => Atom::ReadU8(save), Some(b'2') => Atom::ReadU16(save), Some(b'4') => Atom::ReadU32(save), _ => return Err(PatError::ReadOperand), }; if save >= u8::MAX { return Err(PatError::SaveOverflow); } save += 1; result.push(atom); }, b'z' => { if save >= u8::MAX { return Err(PatError::SaveOverflow); } result.push(Atom::Zero(save)); save += 1; }, // Allow spaces as padding b' ' | b'\n' | b'\r' | b'\t' => {}, // Everything else is illegal _ => { return Err(PatError::UnknownChar); }, } // Converted from str originally, should be safe *pat = unsafe { str::from_utf8_unchecked(iter.as_slice()) }; } // Check balanced stack operators if depth != 0 { return Err(PatError::StackError); } // Check if sub patterns are balanced if subs.len() != 0 { return Err(PatError::SubPattern); } // Remove redundant atoms at the end fn is_redundant(atom: &Atom) -> bool { match atom { | Atom::Skip(_) | Atom::Rangext(_) | Atom::Pop | Atom::Many(_) => true, _ => false, } } while result.last().map(is_redundant).unwrap_or(false) { result.pop(); } Ok(()) }