From 7e03af23de7a193bb873ffaa6c3cdeb183b48566 Mon Sep 17 00:00:00 2001 From: Alexandr Mansurov Date: Thu, 22 Jan 2026 10:09:52 +0100 Subject: [PATCH] Improved memory and disk usage, best practices. Review done by GPT 5.2, code written by Claude Opus 4.5 --- src/db.rs | 109 +++++++++++---- src/files.rs | 51 +++++-- src/main.rs | 106 +++++++++----- src/parser.rs | 372 ++++++++++++++++++++++++++++++++++---------------- 4 files changed, 445 insertions(+), 193 deletions(-) diff --git a/src/db.rs b/src/db.rs index 8d40d18..704d7b0 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,5 +1,5 @@ use anyhow::Result; -use rusqlite::{params, Connection, Transaction}; +use rusqlite::{Connection, Transaction, params}; use std::collections::HashMap; use crate::parser::SignatureEntry; @@ -12,8 +12,17 @@ impl Database { pub fn new(path: &str) -> Result { let conn = Connection::open(path)?; - // Enable WAL mode for better concurrent read/write performance + // Production-optimized pragmas for bulk ingestion + // WAL mode for better concurrent read/write performance conn.pragma_update(None, "journal_mode", "WAL")?; + // NORMAL synchronous is acceptable for ingestion pipelines (fsync on checkpoint only) + conn.pragma_update(None, "synchronous", "NORMAL")?; + // Bound WAL growth: checkpoint every 1000 pages (~4MB with default page size) + conn.pragma_update(None, "wal_autocheckpoint", 1000)?; + // Small bounded cache to limit memory usage (negative = KB) + conn.pragma_update(None, "cache_size", -8000)?; // 8MB cache + // Store temp tables in memory to reduce disk I/O + conn.pragma_update(None, "temp_store", "MEMORY")?; let db = Self { conn }; db.init_schema()?; @@ -21,6 +30,7 @@ impl Database { } fn init_schema(&self) -> Result<()> { + // Create tables without indexes - indexes will be created after ingestion self.conn.execute_batch( r#" -- Lookup tables for low-cardinality text columns @@ -55,6 +65,7 @@ impl Database { ); -- Main table with normalized foreign keys and integer timestamp + -- Indexes are created AFTER ingestion for better bulk insert performance CREATE TABLE IF NOT EXISTS signature_entries ( id INTEGER PRIMARY KEY, session_id TEXT NOT NULL, @@ -72,12 +83,32 @@ impl Database { device_id INTEGER REFERENCES devices(id), password_autofill_usage INTEGER ); + "#, + )?; + Ok(()) + } + /// Create indexes and optimize database after ingestion is complete + pub fn finalize(&self) -> Result<()> { + eprintln!("Creating indexes..."); + self.conn.execute_batch( + r#" CREATE INDEX IF NOT EXISTS idx_session_id ON signature_entries(session_id); CREATE INDEX IF NOT EXISTS idx_timestamp ON signature_entries(timestamp_ms); CREATE INDEX IF NOT EXISTS idx_version ON signature_entries(version_id); "#, )?; + + // Run optimizer to analyze tables and update statistics + // Checkpoint and truncate WAL to reduce disk usage + eprintln!("Optimizing database and Checkpointing WAL..."); + self.conn.execute_batch( + r#" + PRAGMA optimize; + PRAGMA wal_checkpoint(TRUNCATE); + "#, + )?; + Ok(()) } @@ -94,6 +125,26 @@ impl Database { let mut os_cache: HashMap = HashMap::new(); let mut app_name_cache: HashMap = HashMap::new(); + // Prepare all lookup statements once (using INSERT ... ON CONFLICT ... RETURNING) + let mut apps_stmt = tx.prepare_cached( + "INSERT INTO apps (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut versions_stmt = tx.prepare_cached( + "INSERT INTO versions (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut models_stmt = tx.prepare_cached( + "INSERT INTO models (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut devices_stmt = tx.prepare_cached( + "INSERT INTO devices (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut os_stmt = tx.prepare_cached( + "INSERT INTO os_versions (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut app_names_stmt = tx.prepare_cached( + "INSERT INTO app_names (name) VALUES (?) ON CONFLICT(name) DO UPDATE SET name=excluded.name RETURNING id", + )?; + let mut insert_stmt = tx.prepare_cached( r#" INSERT INTO signature_entries ( @@ -106,12 +157,29 @@ impl Database { )?; for entry in entries { - let app_id = get_or_insert_lookup(tx, &mut app_cache, "apps", &entry.app)?; - let version_id = get_or_insert_lookup(tx, &mut version_cache, "versions", &entry.version)?; - let model_id = entry.model.as_ref().map(|v| get_or_insert_lookup(tx, &mut model_cache, "models", v)).transpose()?; - let device_id = entry.device.as_ref().map(|v| get_or_insert_lookup(tx, &mut device_cache, "devices", v)).transpose()?; - let os_id = entry.os.as_ref().map(|v| get_or_insert_lookup(tx, &mut os_cache, "os_versions", v)).transpose()?; - let app_name_id = entry.app_name.as_ref().map(|v| get_or_insert_lookup(tx, &mut app_name_cache, "app_names", v)).transpose()?; + let app_id = get_or_insert_cached(&mut apps_stmt, &mut app_cache, &entry.app)?; + let version_id = + get_or_insert_cached(&mut versions_stmt, &mut version_cache, &entry.version)?; + let model_id = entry + .model + .as_ref() + .map(|v| get_or_insert_cached(&mut models_stmt, &mut model_cache, v)) + .transpose()?; + let device_id = entry + .device + .as_ref() + .map(|v| get_or_insert_cached(&mut devices_stmt, &mut device_cache, v)) + .transpose()?; + let os_id = entry + .os + .as_ref() + .map(|v| get_or_insert_cached(&mut os_stmt, &mut os_cache, v)) + .transpose()?; + let app_name_id = entry + .app_name + .as_ref() + .map(|v| get_or_insert_cached(&mut app_names_stmt, &mut app_name_cache, v)) + .transpose()?; insert_stmt.execute(params![ entry.session_id, @@ -135,32 +203,19 @@ impl Database { } } -/// Get or insert a value into a lookup table, using a cache to minimize DB queries -fn get_or_insert_lookup( - tx: &Transaction<'_>, +/// Get or insert a value using a prepared statement with RETURNING, with in-memory cache +fn get_or_insert_cached( + stmt: &mut rusqlite::CachedStatement<'_>, cache: &mut HashMap, - table: &str, value: &str, ) -> Result { + // Check cache first if let Some(&id) = cache.get(value) { return Ok(id); } - // Try to find existing entry - let query = format!("SELECT id FROM {} WHERE name = ?", table); - let existing: Option = tx - .query_row(&query, params![value], |row| row.get(0)) - .ok(); - - if let Some(id) = existing { - cache.insert(value.to_string(), id); - return Ok(id); - } - - // Insert new entry - let insert = format!("INSERT INTO {} (name) VALUES (?)", table); - tx.execute(&insert, params![value])?; - let id = tx.last_insert_rowid(); + // Use INSERT ... ON CONFLICT ... RETURNING to get id in one round-trip + let id: i64 = stmt.query_row(params![value], |row| row.get(0))?; cache.insert(value.to_string(), id); Ok(id) } diff --git a/src/files.rs b/src/files.rs index 8f447ec..3a19902 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1,10 +1,41 @@ -use anyhow::{anyhow, Result}; +use anyhow::{Result, anyhow}; use chrono::NaiveDate; use flate2::read::GzDecoder; use std::fs::File; -use std::io::{BufRead, BufReader}; +use std::io::{BufRead, BufReader, Read}; use std::path::PathBuf; +/// Enum-based reader to avoid Box heap allocation and dynamic dispatch +pub enum LogReader { + Plain(BufReader), + Gzip(BufReader>), +} + +impl Read for LogReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + match self { + LogReader::Plain(r) => r.read(buf), + LogReader::Gzip(r) => r.read(buf), + } + } +} + +impl BufRead for LogReader { + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { + match self { + LogReader::Plain(r) => r.fill_buf(), + LogReader::Gzip(r) => r.fill_buf(), + } + } + + fn consume(&mut self, amt: usize) { + match self { + LogReader::Plain(r) => r.consume(amt), + LogReader::Gzip(r) => r.consume(amt), + } + } +} + /// Discovers log files for a given date range pub struct LogFileDiscovery { base_dir: PathBuf, @@ -25,9 +56,7 @@ impl LogFileDiscovery { if let Some(log_file) = self.find_log_for_date(current)? { files.push(log_file); } - current = current - .succ_opt() - .ok_or_else(|| anyhow!("Date overflow"))?; + current = current.succ_opt().ok_or_else(|| anyhow!("Date overflow"))?; } Ok(files) @@ -72,26 +101,26 @@ pub struct LogFile { impl LogFile { /// Returns a buffered reader for this log file, handling compression transparently - pub fn reader(&self) -> Result> { + pub fn reader(&self) -> Result { let file = File::open(&self.path)?; if self.compressed { let decoder = GzDecoder::new(file); - Ok(Box::new(BufReader::new(decoder))) + Ok(LogReader::Gzip(BufReader::new(decoder))) } else { - Ok(Box::new(BufReader::new(file))) + Ok(LogReader::Plain(BufReader::new(file))) } } } /// For reading a single file directly (e.g., for testing) -pub fn read_log_file(path: &str) -> Result> { +pub fn read_log_file(path: &str) -> Result { let file = File::open(path)?; if path.ends_with(".gz") { let decoder = GzDecoder::new(file); - Ok(Box::new(BufReader::new(decoder))) + Ok(LogReader::Gzip(BufReader::new(decoder))) } else { - Ok(Box::new(BufReader::new(file))) + Ok(LogReader::Plain(BufReader::new(file))) } } diff --git a/src/main.rs b/src/main.rs index 5cfe42f..9eb1411 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,13 @@ -use anyhow::{anyhow, Result}; +use anyhow::{Result, anyhow}; use chrono::NaiveDate; use clap::Parser; -use crossbeam_channel::{bounded, Sender}; +use crossbeam_channel::{Sender, bounded}; use rayon::prelude::*; use std::collections::HashMap; use std::io::BufRead; use std::path::PathBuf; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use std::thread; mod db; @@ -15,7 +15,7 @@ mod files; mod parser; use db::Database; -use files::{read_log_file, LogFile, LogFileDiscovery}; +use files::{LogFile, LogFileDiscovery, LogReader, read_log_file}; use parser::{ParsedMessage, ParserRegistry, SignatureEntry}; #[derive(Parser, Debug)] @@ -63,11 +63,15 @@ fn main() -> Result<()> { let args = Args::parse(); // Configure rayon thread pool if threads specified - if args.threads > 0 { - rayon::ThreadPoolBuilder::new() + if args.threads > 0 + && let Err(e) = rayon::ThreadPoolBuilder::new() .num_threads(args.threads) .build_global() - .ok(); // Ignore error if pool already initialized + { + eprintln!( + "Warning: Could not configure thread pool ({}), using default", + e + ); } let use_parallel = args.threads != 1; @@ -79,6 +83,7 @@ fn main() -> Result<()> { eprintln!("Processing single file: {}", file_path.display()); let reader = read_log_file(file_path.to_str().unwrap())?; process_reader(reader, ®istry, &mut db, args.batch_size)?; + db.finalize()?; } else { // Process date range let from = parse_date( @@ -142,12 +147,18 @@ fn process_files_sequential( let reader = log_file.reader()?; process_reader(reader, ®istry, &mut db, batch_size)?; } + + db.finalize()?; Ok(()) } fn process_files_parallel(log_files: Vec, output: &str, batch_size: usize) -> Result<()> { let num_threads = rayon::current_num_threads(); - eprintln!("Processing {} files with {} threads", log_files.len(), num_threads); + eprintln!( + "Processing {} files with {} threads", + log_files.len(), + num_threads + ); // Channel for sending parsed entries to the DB writer // Buffer size: enough batches to keep workers busy without excessive memory @@ -158,6 +169,9 @@ fn process_files_parallel(log_files: Vec, output: &str, batch_size: usi let parsed_lines = Arc::new(AtomicU64::new(0)); let error_count = Arc::new(AtomicU64::new(0)); + // Shared parser registry - parsers are stateless, so we can share one instance + let registry = Arc::new(ParserRegistry::new()); + // Spawn DB writer thread let output_path = output.to_string(); let db_handle = thread::spawn(move || -> Result<()> { @@ -169,29 +183,33 @@ fn process_files_parallel(log_files: Vec, output: &str, batch_size: usi tx.commit()?; } + db.finalize()?; Ok(()) }); // Process files in parallel - let result: Result<()> = log_files - .into_par_iter() - .try_for_each(|log_file| { - let file_path = log_file.path.display().to_string(); - let compressed = if log_file.compressed { "compressed" } else { "plain" }; - eprintln!("Starting: {} ({})", file_path, compressed); + let result: Result<()> = log_files.into_par_iter().try_for_each(|log_file| { + let file_path = log_file.path.display().to_string(); + let compressed = if log_file.compressed { + "compressed" + } else { + "plain" + }; + eprintln!("Starting: {} ({})", file_path, compressed); - process_file_parallel( - log_file, - &sender, - batch_size, - &total_lines, - &parsed_lines, - &error_count, - )?; + process_file_parallel( + log_file, + ®istry, + &sender, + batch_size, + &total_lines, + &parsed_lines, + &error_count, + )?; - eprintln!("Finished: {}", file_path); - Ok(()) - }); + eprintln!("Finished: {}", file_path); + Ok(()) + }); // Close the channel so DB writer knows to stop drop(sender); @@ -214,25 +232,35 @@ fn process_files_parallel(log_files: Vec, output: &str, batch_size: usi fn process_file_parallel( log_file: LogFile, + registry: &Arc, sender: &Sender>, batch_size: usize, total_lines: &AtomicU64, parsed_lines: &AtomicU64, error_count: &AtomicU64, ) -> Result<()> { - let registry = ParserRegistry::new(); - let reader = log_file.reader()?; + let mut reader = log_file.reader()?; let mut batch: Vec = Vec::with_capacity(batch_size); let mut file_lines = 0u64; let mut file_parsed = 0u64; let mut file_errors = 0u64; - for line_result in reader.lines() { - let line = line_result?; + // Reuse line buffer to avoid per-line allocations + let mut line = String::new(); + + loop { + line.clear(); + let bytes_read = reader.read_line(&mut line)?; + if bytes_read == 0 { + break; // EOF + } file_lines += 1; - if let Some(parse_result) = registry.parse(&line) { + // Trim newline for parsing (without reallocating) + let line_trimmed = line.trim_end(); + + if let Some(parse_result) = registry.parse(line_trimmed) { match parse_result { Ok(ParsedMessage::Signature(entry)) => { batch.push(entry); @@ -266,7 +294,7 @@ fn process_file_parallel( } fn process_reader( - reader: Box, + mut reader: LogReader, registry: &ParserRegistry, db: &mut Database, batch_size: usize, @@ -276,11 +304,21 @@ fn process_reader( let mut parsed_lines = 0u64; let mut error_counts: HashMap = HashMap::new(); - for line_result in reader.lines() { - let line = line_result?; + // Reuse line buffer to avoid per-line allocations + let mut line = String::new(); + + loop { + line.clear(); + let bytes_read = reader.read_line(&mut line)?; + if bytes_read == 0 { + break; // EOF + } total_lines += 1; - if let Some(parse_result) = registry.parse(&line) { + // Trim newline for parsing (without reallocating) + let line_trimmed = line.trim_end(); + + if let Some(parse_result) = registry.parse(line_trimmed) { match parse_result { Ok(ParsedMessage::Signature(entry)) => { signature_batch.push(entry); diff --git a/src/parser.rs b/src/parser.rs index 584b6dd..709cf52 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, Result}; +use anyhow::{Result, anyhow}; use chrono::NaiveDateTime; use regex::Regex; use std::sync::LazyLock; @@ -23,6 +23,45 @@ pub struct SignatureEntry { pub password_autofill_usage: Option, } +/// Parsed details without HashMap allocation - holds string slices into the original details string +struct ParsedDetails<'a> { + offline_login_usage: Option<&'a str>, + is_password_autofill_enabled: Option<&'a str>, + camera_roll_usage: Option<&'a str>, + os: Option<&'a str>, + app_name: Option<&'a str>, + touch_id: Option<&'a str>, + is_offline_login_enabled: Option<&'a str>, + model: Option<&'a str>, + device: Option<&'a str>, + password_autofill_usage: Option<&'a str>, +} + +impl<'a> ParsedDetails<'a> { + fn new() -> Self { + Self { + offline_login_usage: None, + is_password_autofill_enabled: None, + camera_roll_usage: None, + os: None, + app_name: None, + touch_id: None, + is_offline_login_enabled: None, + model: None, + device: None, + password_autofill_usage: None, + } + } +} + +/// Parsed mobile details without HashMap allocation +struct ParsedMobileDetails<'a> { + os: Option<&'a str>, + app_name: Option<&'a str>, + model: Option<&'a str>, + device: Option<&'a str>, +} + /// Trait for parsing different message types from logs. /// Implement this trait to add support for new message formats. pub trait MessageParser: Send + Sync { @@ -39,8 +78,9 @@ pub enum ParsedMessage { static SESSION_ID_RE: LazyLock = LazyLock::new(|| Regex::new(r"sessionId=([^,\s]+)").unwrap()); -static DATETIME_RE: LazyLock = - LazyLock::new(|| Regex::new(r#"dt="(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})(?:,(\d{3}))?"#).unwrap()); +static DATETIME_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"dt="(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})(?:,(\d{3}))?"#).unwrap() +}); static CORRELATION_ID_RE: LazyLock = LazyLock::new(|| Regex::new(r"correlationId=([^,\s]+)").unwrap()); static SIGNATURE_RE: LazyLock = @@ -100,25 +140,26 @@ impl SignatureParser { let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); let details_str = caps.get(3).map(|m| m.as_str()).unwrap(); - // Parse details key-value pairs - // Handle the tricky "device:iOS, Apple" case by parsing carefully - let details = parse_details(details_str)?; + // Parse details key-value pairs directly into struct (no HashMap) + let details = parse_details_direct(details_str); let entry = SignatureEntry { session_id, timestamp_ms, app, version, - offline_login_usage: parse_number(&details, "offlineLoginUsage"), - is_password_autofill_enabled: parse_bool(&details, "isPasswordAutofillEnabled"), - camera_roll_usage: parse_number(&details, "cameraRollUsage"), - os: get_string(&details, "OS"), - app_name: get_string(&details, "appName"), - touch_id: parse_bool(&details, "touchID"), - is_offline_login_enabled: parse_bool(&details, "isOfflineLoginEnabled"), - model: get_string(&details, "model"), - device: get_string(&details, "device"), - password_autofill_usage: parse_number(&details, "passwordAutofillUsage"), + offline_login_usage: details.offline_login_usage.and_then(parse_number_str), + is_password_autofill_enabled: details + .is_password_autofill_enabled + .and_then(parse_bool_str), + camera_roll_usage: details.camera_roll_usage.and_then(parse_number_str), + os: details.os.map(|s| s.to_string()), + app_name: details.app_name.map(|s| s.to_string()), + touch_id: details.touch_id.and_then(parse_bool_str), + is_offline_login_enabled: details.is_offline_login_enabled.and_then(parse_bool_str), + model: details.model.map(|s| s.to_string()), + device: details.device.map(|s| s.to_string()), + password_autofill_usage: details.password_autofill_usage.and_then(parse_number_str), }; Ok(ParsedMessage::Signature(entry)) @@ -151,7 +192,7 @@ impl MobileClientIosParser { let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); let details_str = caps.get(4).map(|m| m.as_str()).unwrap(); - let details = parse_mobile_details(details_str); + let details = parse_mobile_details_direct(details_str); let entry = SignatureEntry { session_id, @@ -161,11 +202,11 @@ impl MobileClientIosParser { offline_login_usage: None, is_password_autofill_enabled: None, camera_roll_usage: None, - os: get_string(&details, "os"), - app_name: get_string(&details, "app-name"), + os: details.os.map(|s| s.to_string()), + app_name: details.app_name.map(|s| s.to_string()), touch_id: None, is_offline_login_enabled: None, - model: get_string(&details, "model"), + model: details.model.map(|s| s.to_string()), device: Some("iOS".to_string()), password_autofill_usage: None, }; @@ -200,7 +241,7 @@ impl MobileClientAndroidParser { let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); let details_str = caps.get(4).map(|m| m.as_str()).unwrap(); - let details = parse_mobile_details_android(details_str); + let details = parse_mobile_details_android_direct(details_str); let entry = SignatureEntry { session_id, @@ -210,12 +251,12 @@ impl MobileClientAndroidParser { offline_login_usage: None, is_password_autofill_enabled: None, camera_roll_usage: None, - os: get_string(&details, "os"), + os: details.os.map(|s| s.to_string()), app_name: Some("native Android".to_string()), touch_id: None, is_offline_login_enabled: None, - model: get_string(&details, "model"), - device: get_string(&details, "device"), + model: details.model.map(|s| s.to_string()), + device: details.device.map(|s| s.to_string()), password_autofill_usage: None, }; @@ -250,93 +291,95 @@ fn extract_correlation_id(line: &str) -> Result { .ok_or_else(|| anyhow!("Missing correlationId")) } -/// Parse mobile client details for iOS (simple comma-separated key:value) -fn parse_mobile_details(details: &str) -> std::collections::HashMap { - let mut map = std::collections::HashMap::new(); - +/// Parse mobile client details for iOS directly into struct (no HashMap allocation) +fn parse_mobile_details_direct(details: &str) -> ParsedMobileDetails<'_> { // Keys for iOS mobile client - let known_keys = ["sdk-client", "sdk-version", "app-name", "device", "model", "os"]; + const KNOWN_KEYS: [&str; 6] = [ + "sdk-client", + "sdk-version", + "app-name", + "device", + "model", + "os", + ]; - let mut key_positions: Vec<(usize, &str)> = known_keys + let mut key_positions: Vec<(usize, &str)> = KNOWN_KEYS .iter() - .filter_map(|&key| { - let pattern = format!("{}:", key); - details.find(&pattern).map(|pos| (pos, key)) - }) + .filter_map(|&key| find_key_position(details, key)) .collect(); key_positions.sort_by_key(|&(pos, _)| pos); + let mut result = ParsedMobileDetails { + os: None, + app_name: None, + model: None, + device: None, + }; + for i in 0..key_positions.len() { let (pos, key) = key_positions[i]; - let value_start = pos + key.len() + 1; + let value = extract_value(details, pos, key, i, &key_positions); - let value_end = if i + 1 < key_positions.len() { - let next_pos = key_positions[i + 1].0; - if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { - next_pos - 1 - } else { - next_pos - } - } else { - details.find(" user-agent").unwrap_or(details.len()) - }; - - let value = details[value_start..value_end].trim().to_string(); - map.insert(key.to_string(), value); + match key { + "os" => result.os = Some(value), + "app-name" => result.app_name = Some(value), + "model" => result.model = Some(value), + "device" => result.device = Some(value), + _ => {} + } } - map + result } -/// Parse mobile client details for Android (handles device with commas) -fn parse_mobile_details_android(details: &str) -> std::collections::HashMap { - let mut map = std::collections::HashMap::new(); - +/// Parse mobile client details for Android directly into struct (no HashMap allocation) +fn parse_mobile_details_android_direct(details: &str) -> ParsedMobileDetails<'_> { // For Android, device can contain commas like "Android, samsung" - // Keys in order: sdk-client, sdk-version, app-name, device, model, os - let known_keys = ["sdk-client", "sdk-version", "app-name", "device", "model", "os"]; + const KNOWN_KEYS: [&str; 6] = [ + "sdk-client", + "sdk-version", + "app-name", + "device", + "model", + "os", + ]; - let mut key_positions: Vec<(usize, &str)> = known_keys + let mut key_positions: Vec<(usize, &str)> = KNOWN_KEYS .iter() - .filter_map(|&key| { - let pattern = format!("{}:", key); - details.find(&pattern).map(|pos| (pos, key)) - }) + .filter_map(|&key| find_key_position(details, key)) .collect(); key_positions.sort_by_key(|&(pos, _)| pos); + let mut result = ParsedMobileDetails { + os: None, + app_name: None, + model: None, + device: None, + }; + for i in 0..key_positions.len() { let (pos, key) = key_positions[i]; - let value_start = pos + key.len() + 1; + let value = extract_value(details, pos, key, i, &key_positions); - let value_end = if i + 1 < key_positions.len() { - let next_pos = key_positions[i + 1].0; - // Find the comma before the next key - if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { - next_pos - 1 - } else { - next_pos - } - } else { - details.find(" user-agent").unwrap_or(details.len()) - }; - - let value = details[value_start..value_end].trim().to_string(); - map.insert(key.to_string(), value); + match key { + "os" => result.os = Some(value), + "app-name" => result.app_name = Some(value), + "model" => result.model = Some(value), + "device" => result.device = Some(value), + _ => {} + } } - map + result } -/// Parse the details string which has format like: -/// offlineLoginUsage:0,isPasswordAutofillEnabled:no,...,device:iOS, Apple,passwordAutofillUsage:0 -fn parse_details(details: &str) -> Result> { - let mut map = std::collections::HashMap::new(); - +/// Parse the details string directly into ParsedDetails (no HashMap allocation) +/// Format: offlineLoginUsage:0,isPasswordAutofillEnabled:no,...,device:iOS, Apple,passwordAutofillUsage:0 +fn parse_details_direct(details: &str) -> ParsedDetails<'_> { // Known keys in order they appear - let known_keys = [ + const KNOWN_KEYS: [&str; 10] = [ "offlineLoginUsage", "isPasswordAutofillEnabled", "cameraRollUsage", @@ -350,61 +393,147 @@ fn parse_details(details: &str) -> Result = known_keys + let mut key_positions: Vec<(usize, &str)> = KNOWN_KEYS .iter() - .filter_map(|&key| { - let pattern = format!("{}:", key); - details.find(&pattern).map(|pos| (pos, key)) - }) + .filter_map(|&key| find_key_position(details, key)) .collect(); // Sort by position key_positions.sort_by_key(|&(pos, _)| pos); + let mut result = ParsedDetails::new(); + // Extract values between keys for i in 0..key_positions.len() { let (pos, key) = key_positions[i]; - let value_start = pos + key.len() + 1; // +1 for ':' + let value = extract_value(details, pos, key, i, &key_positions); - let value_end = if i + 1 < key_positions.len() { - // Value ends at the comma before the next key - let next_pos = key_positions[i + 1].0; - // Find the comma before the next key - if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { - next_pos - 1 - } else { - next_pos - } - } else { - // Last key - value goes until " user-agent" or end - details - .find(" user-agent") - .unwrap_or(details.len()) - }; - - let value = details[value_start..value_end].trim().to_string(); - map.insert(key.to_string(), value); + match key { + "offlineLoginUsage" => result.offline_login_usage = Some(value), + "isPasswordAutofillEnabled" => result.is_password_autofill_enabled = Some(value), + "cameraRollUsage" => result.camera_roll_usage = Some(value), + "OS" => result.os = Some(value), + "appName" => result.app_name = Some(value), + "touchID" => result.touch_id = Some(value), + "isOfflineLoginEnabled" => result.is_offline_login_enabled = Some(value), + "model" => result.model = Some(value), + "device" => result.device = Some(value), + "passwordAutofillUsage" => result.password_autofill_usage = Some(value), + _ => {} + } } - Ok(map) + result } -fn parse_number(map: &std::collections::HashMap, key: &str) -> Option { - map.get(key).and_then(|v| v.parse().ok()) -} - -fn parse_bool(map: &std::collections::HashMap, key: &str) -> Option { - map.get(key).and_then(|value| { - match value.to_lowercase().as_str() { - "yes" | "true" | "1" => Some(true), - "no" | "false" | "0" => Some(false), - _ => None, +/// Find the position of a key in the details string without allocating +#[inline] +fn find_key_position<'a>(details: &str, key: &'a str) -> Option<(usize, &'a str)> { + // Search for "key:" pattern + let mut search_start = 0; + while let Some(pos) = details[search_start..].find(key) { + let absolute_pos = search_start + pos; + // Check if followed by ':' + if details.as_bytes().get(absolute_pos + key.len()) == Some(&b':') { + // Check if at start or preceded by comma + if absolute_pos == 0 || details.as_bytes().get(absolute_pos - 1) == Some(&b',') { + return Some((absolute_pos, key)); + } } - }) + search_start = absolute_pos + 1; + } + None } -fn get_string(map: &std::collections::HashMap, key: &str) -> Option { - map.get(key).map(|s| s.to_string()) +/// Extract a value from the details string without allocating +#[inline] +fn extract_value<'a>( + details: &'a str, + pos: usize, + key: &str, + index: usize, + key_positions: &[(usize, &str)], +) -> &'a str { + let value_start = pos + key.len() + 1; // +1 for ':' + + let value_end = if index + 1 < key_positions.len() { + let next_pos = key_positions[index + 1].0; + // Find the comma before the next key + if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { + next_pos - 1 + } else { + next_pos + } + } else { + // Last key - value goes until " user-agent" or end + details.find(" user-agent").unwrap_or(details.len()) + }; + + details[value_start..value_end].trim() +} + +/// Parse a number from a string slice without allocation +#[inline] +fn parse_number_str(s: &str) -> Option { + s.parse().ok() +} + +/// Parse a boolean from a string slice using ASCII-insensitive matching (no allocation) +#[inline] +fn parse_bool_str(value: &str) -> Option { + let bytes = value.as_bytes(); + match bytes.len() { + 1 => match bytes[0] { + b'1' => Some(true), + b'0' => Some(false), + _ => None, + }, + 2 => { + // "no" + if (bytes[0] == b'n' || bytes[0] == b'N') && (bytes[1] == b'o' || bytes[1] == b'O') { + Some(false) + } else { + None + } + } + 3 => { + // "yes" + if (bytes[0] == b'y' || bytes[0] == b'Y') + && (bytes[1] == b'e' || bytes[1] == b'E') + && (bytes[2] == b's' || bytes[2] == b'S') + { + Some(true) + } else { + None + } + } + 4 => { + // "true" + if (bytes[0] == b't' || bytes[0] == b'T') + && (bytes[1] == b'r' || bytes[1] == b'R') + && (bytes[2] == b'u' || bytes[2] == b'U') + && (bytes[3] == b'e' || bytes[3] == b'E') + { + Some(true) + } else { + None + } + } + 5 => { + // "false" + if (bytes[0] == b'f' || bytes[0] == b'F') + && (bytes[1] == b'a' || bytes[1] == b'A') + && (bytes[2] == b'l' || bytes[2] == b'L') + && (bytes[3] == b's' || bytes[3] == b'S') + && (bytes[4] == b'e' || bytes[4] == b'E') + { + Some(false) + } else { + None + } + } + _ => None, + } } /// Registry of all available message parsers @@ -477,7 +606,8 @@ mod tests { #[test] fn test_parse_non_signature_line() { - let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06", msg="some other message""#; + let line = + r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06", msg="some other message""#; let registry = ParserRegistry::new(); assert!(registry.parse(line).is_none()); }