commit 169409738f778644f690ac90204f28f43a04c0d8 Author: Alexandr Mansurov Date: Wed Jan 21 22:34:48 2026 +0100 Add log ingestion tool for loading signature logs into SQLite - Parse signature messages from log files extracting app info, device details, and feature flags (autofill, touchID, offline login, etc.) - Support both plain .log and gzip compressed .log.gz files - File discovery by date range (YYYY/mm/dd directory structure) - Batch inserts for performance with large files (10GB+ per day) - Index on session_id and version for efficient queries - Extensible parser architecture via MessageParser trait - Parallel file processing for multi-day ingestion Co-Authored-By: Claude Opus 4.5 diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml new file mode 100644 index 0000000..903f8af --- /dev/null +++ b/.github/dependabot.yaml @@ -0,0 +1,14 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + time: "07:00" + timezone: "Asia/Tokyo" + - package-ecosystem: cargo + directory: / + schedule: + interval: weekly + time: "07:00" + timezone: "Asia/Tokyo" diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml new file mode 100644 index 0000000..58597e6 --- /dev/null +++ b/.github/workflows/audit.yaml @@ -0,0 +1,31 @@ +name: Security audit + +on: + schedule: + - cron: "0 0 */3 * *" + + push: + branches: [main] + paths: + - "**/Cargo.toml" + - "**/Cargo.lock" + + pull_request: + paths: + - "**/Cargo.toml" + - "**/Cargo.lock" + +jobs: + audit: + name: Audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install cargo-audit + uses: taiki-e/install-action@30eab0fabba9ea3f522099957e668b21876aa39e # v2.66.6 + with: + tool: cargo-audit + + - name: Run audit + run: cargo audit diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml new file mode 100644 index 0000000..a794c2c --- /dev/null +++ b/.github/workflows/benchmark.yaml @@ -0,0 +1,39 @@ +name: Benchmark +on: + push: + branches: + - main + paths: + - '**/*.rs' + - '**/Cargo.toml' + - '**/Cargo.lock' + - '.github/workflows/benchmark.yaml' + +permissions: + contents: write + deployments: write + +jobs: + benchmark: + name: Run Rust benchmark example + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + - name: Toolchain setup + run: rustup toolchain update nightly && rustup default nightly + - name: Run benchmark + run: cargo +nightly bench | tee output.txt + + - name: Store benchmark result + uses: benchmark-action/github-action-benchmark@4bdcce38c94cec68da58d012ac24b7b1155efe8b # v1.20.7 + with: + name: Rust Benchmark + tool: 'cargo' + output-file-path: output.txt + github-token: ${{ secrets.GITHUB_TOKEN }} + auto-push: true + # Show alert with commit comment on detecting possible performance regression + alert-threshold: '200%' + comment-on-alert: true + fail-on-alert: true + benchmark-data-dir-path: docs diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..019aa36 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,97 @@ +name: Rust CI + +on: + push: + branches: [main] + paths: + - '**/*.rs' + - '**/Cargo.toml' + - '**/Cargo.lock' + - '.github/workflows/ci.yaml' + + pull_request: + paths: + - '**/*.rs' + - '**/Cargo.toml' + - '**/Cargo.lock' + - '.github/workflows/ci.yaml' + +env: + CARGO_TERM_COLOR: always + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + check: + name: Check + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 0 + + - uses: actions-rust-lang/setup-rust-toolchain@1780873c7b576612439a134613cc4cc74ce5538c # v1.15.2 + with: + components: rustfmt, clippy + cache-shared-key: setup-rust-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/Cargo.lock') }} + + - name: Install reviewdog + uses: reviewdog/action-setup@d8a7baabd7f3e8544ee4dbde3ee41d0011c3a93f # v1.5.0 + + - name: Check format + run: | + cargo fmt --all -- --check + + - uses: giraffate/clippy-action@13b9d32482f25d29ead141b79e7e04e7900281e0 # v1.0.1 + with: + reporter: 'github-pr-review' + github_token: ${{ secrets.GITHUB_TOKEN }} + fail_on_error: true + filter_mode: nofilter + + - name: Build + run: cargo build + + test: + name: Test + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 0 + + - uses: actions-rust-lang/setup-rust-toolchain@1780873c7b576612439a134613cc4cc74ce5538c # v1.15.2 + with: + components: llvm-tools-preview + cache-shared-key: setup-rust-${{ runner.os }}-${{ runner.arch }}-${{ hashFiles('**/Cargo.lock') }} + + - name: Install tools + uses: taiki-e/install-action@30eab0fabba9ea3f522099957e668b21876aa39e # v2.66.6 + with: + tool: cargo-llvm-cov, cargo-nextest + + - name: Run test + if: runner.os != 'Linux' + run: | + cargo nextest run + + - name: Generate coverage + if: runner.os == 'Linux' + run: cargo llvm-cov nextest --lcov --output-path lcov.info + + - name: Upload coverage + if: runner.os == 'Linux' + uses: k1LoW/octocov-action@73d561f65d59e66899ed5c87e4621a913b5d5c20 # v1.5.0 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/dependabot-auto-merge.yaml b/.github/workflows/dependabot-auto-merge.yaml new file mode 100644 index 0000000..1bd95e6 --- /dev/null +++ b/.github/workflows/dependabot-auto-merge.yaml @@ -0,0 +1,34 @@ +name: Dependabot Auto-merge + +on: + pull_request: + types: + - opened + - synchronize + - reopened + +permissions: + contents: write + pull-requests: write + +jobs: + dependabot-automation: + runs-on: ubuntu-latest + if: ${{ github.actor == 'dependabot[bot]' }} + timeout-minutes: 13 + steps: + - name: Dependabot metadata + id: metadata + uses: dependabot/fetch-metadata@21025c705c08248db411dc16f3619e6b5f9ea21a # v2.5.0 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Approve & enable auto-merge for Dependabot PR + if: | + steps.metadata.outputs.update-type == 'version-update:semver-patch' || + steps.metadata.outputs.update-type == 'version-update:semver-minor' + run: | + gh pr merge --auto -s "$PR_URL" + env: + PR_URL: ${{ github.event.pull_request.html_url }} + PR_TITLE: ${{ github.event.pull_request.title }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml new file mode 100644 index 0000000..d290d21 --- /dev/null +++ b/.github/workflows/release.yaml @@ -0,0 +1,134 @@ +name: Release + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +permissions: + contents: write + +jobs: + build: + name: Build - ${{ matrix.target }} + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + runner: ubuntu-latest + os: Linux + arch: x86_64 + ext: tar.gz + - target: aarch64-unknown-linux-gnu + runner: ubuntu-24.04-arm + os: Linux + arch: arm64 + ext: tar.gz + - target: x86_64-apple-darwin + runner: macos-15-intel + os: Darwin + arch: x86_64 + ext: tar.gz + - target: aarch64-apple-darwin + runner: macos-latest + os: Darwin + arch: arm64 + ext: tar.gz + - target: x86_64-pc-windows-msvc + runner: windows-latest + os: Windows + arch: x86_64 + ext: zip + runs-on: ${{ matrix.runner }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Setup sccache + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9 + + - name: Setup environment variables for sccache + shell: bash + run: | + echo "SCCACHE_GHA_ENABLED=true" >> "$GITHUB_ENV" + echo "RUSTC_WRAPPER=sccache" >> "$GITHUB_ENV" + + - name: Setup Rust + uses: actions-rust-lang/setup-rust-toolchain@1780873c7b576612439a134613cc4cc74ce5538c # v1.15.2 + with: + rustflags: "" + + - name: Get project name + id: project + shell: bash + run: | + name=$(cargo metadata --format-version 1 --no-deps | jq -r '.packages[0].name') + echo "name=$name" >> "$GITHUB_OUTPUT" + + - name: Build + run: cargo build --release + + - name: Create archive (Unix) + if: matrix.os != 'Windows' + shell: bash + run: | + name="${{ steps.project.outputs.name }}" + archive_name="${name}_${{ matrix.os }}_${{ matrix.arch }}.tar.gz" + tar -czvf "$archive_name" -C target/release "$name" + echo "archive_name=$archive_name" >> "$GITHUB_ENV" + + - name: Create archive (Windows) + if: matrix.os == 'Windows' + shell: pwsh + run: | + $name = "${{ steps.project.outputs.name }}" + $archiveName = "${name}_${{ matrix.os }}_${{ matrix.arch }}.zip" + Compress-Archive -Path "target/release/${name}.exe" -DestinationPath $archiveName + echo "archive_name=$archiveName" >> $env:GITHUB_ENV + + - name: Upload artifact + uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0 + with: + name: ${{ steps.project.outputs.name }}_${{ matrix.os }}_${{ matrix.arch }} + path: ${{ env.archive_name }} + if-no-files-found: error + + release: + name: Release + needs: build + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 0 + + - name: Download all artifacts + uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0 + with: + path: artifacts + merge-multiple: true + + - name: Generate changelog + id: changelog + run: | + # Get the previous tag + prev_tag=$(git describe --tags --abbrev=0 HEAD^ 2>/dev/null || echo "") + + if [ -n "$prev_tag" ]; then + echo "## Changes since $prev_tag" > changelog.md + echo "" >> changelog.md + git log --pretty=format:"- %s" "$prev_tag"..HEAD >> changelog.md + else + echo "## Initial Release" > changelog.md + fi + + - name: Create release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create "${{ github.ref_name }}" \ + --title "${{ github.ref_name }}" \ + --notes-file changelog.md \ + artifacts/* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..63ab76c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +.DS_Store +*.db diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..41fb4b4 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,88 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Summary + +Log ingestion tool that parses application log files and loads them into SQLite for analysis. Primary use case is analyzing mobile app telemetry - tracking feature adoption (autofill, touchID, offline login), app versions, device models, etc. + +### What it does +- Parses log lines containing `signature:` messages with app/device telemetry +- Extracts: sessionId, timestamp, app name, version, OS, model, device, and feature flags +- Converts yes/no to booleans, parses numeric usage counters +- Loads into SQLite with indexes on session_id and version for efficient queries + +### Log format +Logs are stored in `/YYYY/MM/DD/` structure, either as `.log` (plain) or `.log.gz` (gzip compressed). A single day can exceed 10GB. + +Example signature message: +``` +msg="signature:XAMARIN_APP/5.23.0/ details:offlineLoginUsage:0,isPasswordAutofillEnabled:no,cameraRollUsage:0,OS:26.2.0,appName:App,touchID:yes,isOfflineLoginEnabled:yes,model:iPhone15,3,device:iOS, Apple,passwordAutofillUsage:0 user-agent:..." +``` + +### Extensibility +The parser uses a `MessageParser` trait allowing new message types to be added. Currently only `signature:` messages are parsed; other message types are skipped. + +## Commands + +### Build & Run +```bash +# Build +cargo build + +# Release build +cargo build --release + +# Run with single file +cargo run -- --file /path/to/logs.log --output output.db + +# Run with date range +cargo run -- --from 2026/01/20 --to 2026/01/21 --base-dir /var/log/app --filename app.log --output output.db +``` + +### Testing +```bash +# Run all tests +cargo test + +# Fast test execution with cargo-nextest (recommended) +cargo nextest run + +# Run a single test +cargo test test_name +``` + +### Quality Checks +```bash +# Format check +cargo fmt -- --check + +# Apply formatting +cargo fmt + +# Static analysis with Clippy +cargo clippy +``` + +## Architecture + +### Source Files +- **src/main.rs**: CLI argument parsing (clap), orchestrates file discovery and processing +- **src/parser.rs**: Log line parsing, `MessageParser` trait, `SignatureParser` implementation +- **src/db.rs**: SQLite schema creation, batched inserts +- **src/files.rs**: File discovery by date range, handles .gz and plain files + +### Database Schema +Table `signature_entries` with columns: session_id, timestamp, app, version, offline_login_usage, is_password_autofill_enabled, camera_roll_usage, os, app_name, touch_id, is_offline_login_enabled, model, device, password_autofill_usage. + +Indexes on `session_id` and `version`. + +### Key Design Decisions +- Batched inserts (default 10k rows per transaction) for performance +- Regex-based parsing with lazy static compilation +- Extensible via `MessageParser` trait + `ParsedMessage` enum + +## CI/CD Configuration +- **ci.yaml**: Formatting, Clippy, build, and tests +- **audit.yaml**: Security audit for dependencies +- **release.yaml**: Automated release on tag push (cross-platform builds via GoReleaser) diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..1b17b8a --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,720 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bitflags" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" + +[[package]] +name = "bumpalo" +version = "3.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" + +[[package]] +name = "cc" +version = "1.2.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.5.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "find-msvc-tools" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" + +[[package]] +name = "flate2" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashlink" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "js-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.180" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" + +[[package]] +name = "libsqlite3-sys" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "947e6816f7825b2b45027c2c32e7085da9934defa535de4a6a46b10a4d5257fa" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "log_ingest" +version = "0.0.1" +dependencies = [ + "anyhow", + "chrono", + "clap", + "crossbeam-channel", + "flate2", + "rayon", + "regex", + "rusqlite", +] + +[[package]] +name = "memchr" +version = "2.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" + +[[package]] +name = "rusqlite" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a22715a5d6deef63c637207afbe68d0c72c3f8d0022d7cf9714c442d6157606b" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "wasm-bindgen" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..13351e7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "log_ingest" +version = "0.0.1" +authors = ["Alexandr Mansurov"] +edition = "2024" + +[dependencies] +clap = { version = "4.5.42", features = ["derive"] } +rusqlite = { version = "0.35", features = ["bundled"] } +chrono = "0.4" +regex = "1" +flate2 = "1" +anyhow = "1" +rayon = "1" +crossbeam-channel = "0.5" diff --git a/Cross.toml b/Cross.toml new file mode 100644 index 0000000..925b91c --- /dev/null +++ b/Cross.toml @@ -0,0 +1,2 @@ +[target.x86_64-unknown-linux-gnu] +image = "ghcr.io/cross-rs/x86_64-unknown-linux-gnu:main" diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7837a23 --- /dev/null +++ b/LICENSE @@ -0,0 +1,8 @@ +The MIT License (MIT) +Copyright (c) 2026, Alexandr Mansurov + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..87fc798 --- /dev/null +++ b/README.md @@ -0,0 +1,180 @@ +# log_ingest + +A Rust CLI tool for loading log files into a SQLite database for analysis. + +## Overview + +Parses application logs containing signature messages and loads them into SQLite for querying. Designed to handle large log volumes (10GB+ per day) with batched inserts and efficient parsing. + +## Features + +- Parse `signature:` messages extracting app info, device details, and feature flags +- Support for both plain `.log` and gzip compressed `.log.gz` files +- File discovery by date range using `YYYY/mm/dd` directory structure +- Batched inserts for performance with large files +- Parallel file processing for multi-day ingestion +- Indexed columns (`session_id`, `version`) for efficient queries +- Extensible parser architecture for adding new message types + +## Installation + +```bash +cargo build --release +``` + +## Usage + +### Process a single file + +```bash +log_ingest --file /path/to/logs.log --output output.db +``` + +### Process a date range + +```bash +log_ingest \ + --from 2026/01/20 \ + --to 2026/01/21 \ + --base-dir /var/log/myapp \ + --filename app.log \ + --output output.db +``` + +The tool will look for files at `/YYYY/MM/DD/.gz` or `/YYYY/MM/DD/` for each day in the range. + +### Parallel processing + +When processing multiple files, parsing runs in parallel by default using all available CPU cores. A single writer thread handles database inserts to avoid SQLite contention. + +```bash +# Use all CPU cores (default) +log_ingest --from 2026/01/01 --to 2026/01/31 ... + +# Limit to 4 threads +log_ingest --threads 4 --from 2026/01/01 --to 2026/01/31 ... + +# Sequential processing (disable parallelism) +log_ingest --threads 1 --from 2026/01/01 --to 2026/01/31 ... +``` + +### Options + +| Option | Description | +|--------|-------------| +| `--file ` | Single log file to process | +| `--from ` | Start date (YYYY/mm/dd) | +| `--to ` | End date (YYYY/mm/dd) | +| `--base-dir ` | Base directory containing log files | +| `--filename ` | Log filename (e.g., `app.log`) | +| `-o, --output ` | Output SQLite database path | +| `--batch-size ` | Batch size for inserts (default: 10000) | +| `--threads ` | Number of parallel threads (0 = all cores, 1 = sequential) | + +## Database Schema + +The schema uses normalized lookup tables to minimize disk usage for large datasets. + +```sql +-- Lookup tables for low-cardinality text columns +CREATE TABLE apps (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); +CREATE TABLE versions (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); +CREATE TABLE models (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); +CREATE TABLE devices (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); +CREATE TABLE os_versions (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); +CREATE TABLE app_names (id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE); + +-- Main table with foreign keys and millisecond timestamp +CREATE TABLE signature_entries ( + id INTEGER PRIMARY KEY, + session_id TEXT NOT NULL, + timestamp_ms INTEGER NOT NULL, -- Unix epoch milliseconds + app_id INTEGER NOT NULL REFERENCES apps(id), + version_id INTEGER NOT NULL REFERENCES versions(id), + offline_login_usage INTEGER, + is_password_autofill_enabled INTEGER, + camera_roll_usage INTEGER, + os_id INTEGER REFERENCES os_versions(id), + app_name_id INTEGER REFERENCES app_names(id), + touch_id INTEGER, + is_offline_login_enabled INTEGER, + model_id INTEGER REFERENCES models(id), + device_id INTEGER REFERENCES devices(id), + password_autofill_usage INTEGER +); + +CREATE INDEX idx_session_id ON signature_entries(session_id); +CREATE INDEX idx_timestamp ON signature_entries(timestamp_ms); +CREATE INDEX idx_version ON signature_entries(version_id); +``` + +## Example Queries + +```sql +-- Percentage of users with password autofill enabled +SELECT + ROUND(100.0 * SUM(is_password_autofill_enabled) / COUNT(*), 2) as pct +FROM signature_entries; + +-- Count by app version +SELECT v.name as version, COUNT(*) as cnt +FROM signature_entries se +JOIN versions v ON se.version_id = v.id +GROUP BY v.name +ORDER BY cnt DESC; + +-- Device breakdown +SELECT d.name as device, COUNT(*) as cnt +FROM signature_entries se +JOIN devices d ON se.device_id = d.id +GROUP BY d.name; + +-- Convert timestamp_ms to readable datetime +SELECT + datetime(timestamp_ms / 1000, 'unixepoch') as timestamp, + session_id +FROM signature_entries +LIMIT 10; +``` + +## Development + +```bash +# Build +cargo build + +# Run tests +cargo test + +# Format +cargo fmt + +# Lint +cargo clippy +``` + +## Cross-Compilation + +To build a Linux x86_64 binary from macOS: + +1. Install [cargo-zigbuild](https://github.com/rust-cross/cargo-zigbuild) and [Zig](https://ziglang.org/): + ```bash + cargo install cargo-zigbuild + brew install zig + ``` + +2. Add the Linux target: + ```bash + rustup target add x86_64-unknown-linux-gnu + ``` + +3. Build: + ```bash + cargo zigbuild --release --target x86_64-unknown-linux-gnu + ``` + +The binary will be at `target/x86_64-unknown-linux-gnu/release/log_ingest`. + +## License + +MIT diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..73328e0 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "1.90" diff --git a/src/db.rs b/src/db.rs new file mode 100644 index 0000000..8d40d18 --- /dev/null +++ b/src/db.rs @@ -0,0 +1,166 @@ +use anyhow::Result; +use rusqlite::{params, Connection, Transaction}; +use std::collections::HashMap; + +use crate::parser::SignatureEntry; + +pub struct Database { + conn: Connection, +} + +impl Database { + pub fn new(path: &str) -> Result { + let conn = Connection::open(path)?; + + // Enable WAL mode for better concurrent read/write performance + conn.pragma_update(None, "journal_mode", "WAL")?; + + let db = Self { conn }; + db.init_schema()?; + Ok(db) + } + + fn init_schema(&self) -> Result<()> { + self.conn.execute_batch( + r#" + -- Lookup tables for low-cardinality text columns + CREATE TABLE IF NOT EXISTS apps ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS versions ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS models ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS devices ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS os_versions ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS app_names ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + -- Main table with normalized foreign keys and integer timestamp + CREATE TABLE IF NOT EXISTS signature_entries ( + id INTEGER PRIMARY KEY, + session_id TEXT NOT NULL, + timestamp_ms INTEGER NOT NULL, + app_id INTEGER NOT NULL REFERENCES apps(id), + version_id INTEGER NOT NULL REFERENCES versions(id), + offline_login_usage INTEGER, + is_password_autofill_enabled INTEGER, + camera_roll_usage INTEGER, + os_id INTEGER REFERENCES os_versions(id), + app_name_id INTEGER REFERENCES app_names(id), + touch_id INTEGER, + is_offline_login_enabled INTEGER, + model_id INTEGER REFERENCES models(id), + device_id INTEGER REFERENCES devices(id), + password_autofill_usage INTEGER + ); + + CREATE INDEX IF NOT EXISTS idx_session_id ON signature_entries(session_id); + CREATE INDEX IF NOT EXISTS idx_timestamp ON signature_entries(timestamp_ms); + CREATE INDEX IF NOT EXISTS idx_version ON signature_entries(version_id); + "#, + )?; + Ok(()) + } + + pub fn begin_transaction(&mut self) -> Result> { + Ok(self.conn.transaction()?) + } + + pub fn insert_signature_batch(tx: &Transaction<'_>, entries: &[SignatureEntry]) -> Result<()> { + // Build lookup caches for this batch + let mut app_cache: HashMap = HashMap::new(); + let mut version_cache: HashMap = HashMap::new(); + let mut model_cache: HashMap = HashMap::new(); + let mut device_cache: HashMap = HashMap::new(); + let mut os_cache: HashMap = HashMap::new(); + let mut app_name_cache: HashMap = HashMap::new(); + + let mut insert_stmt = tx.prepare_cached( + r#" + INSERT INTO signature_entries ( + id, session_id, timestamp_ms, app_id, version_id, + offline_login_usage, is_password_autofill_enabled, camera_roll_usage, + os_id, app_name_id, touch_id, is_offline_login_enabled, + model_id, device_id, password_autofill_usage + ) VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + "#, + )?; + + for entry in entries { + let app_id = get_or_insert_lookup(tx, &mut app_cache, "apps", &entry.app)?; + let version_id = get_or_insert_lookup(tx, &mut version_cache, "versions", &entry.version)?; + let model_id = entry.model.as_ref().map(|v| get_or_insert_lookup(tx, &mut model_cache, "models", v)).transpose()?; + let device_id = entry.device.as_ref().map(|v| get_or_insert_lookup(tx, &mut device_cache, "devices", v)).transpose()?; + let os_id = entry.os.as_ref().map(|v| get_or_insert_lookup(tx, &mut os_cache, "os_versions", v)).transpose()?; + let app_name_id = entry.app_name.as_ref().map(|v| get_or_insert_lookup(tx, &mut app_name_cache, "app_names", v)).transpose()?; + + insert_stmt.execute(params![ + entry.session_id, + entry.timestamp_ms, + app_id, + version_id, + entry.offline_login_usage, + entry.is_password_autofill_enabled.map(|b| b as i32), + entry.camera_roll_usage, + os_id, + app_name_id, + entry.touch_id.map(|b| b as i32), + entry.is_offline_login_enabled.map(|b| b as i32), + model_id, + device_id, + entry.password_autofill_usage, + ])?; + } + + Ok(()) + } +} + +/// Get or insert a value into a lookup table, using a cache to minimize DB queries +fn get_or_insert_lookup( + tx: &Transaction<'_>, + cache: &mut HashMap, + table: &str, + value: &str, +) -> Result { + if let Some(&id) = cache.get(value) { + return Ok(id); + } + + // Try to find existing entry + let query = format!("SELECT id FROM {} WHERE name = ?", table); + let existing: Option = tx + .query_row(&query, params![value], |row| row.get(0)) + .ok(); + + if let Some(id) = existing { + cache.insert(value.to_string(), id); + return Ok(id); + } + + // Insert new entry + let insert = format!("INSERT INTO {} (name) VALUES (?)", table); + tx.execute(&insert, params![value])?; + let id = tx.last_insert_rowid(); + cache.insert(value.to_string(), id); + Ok(id) +} diff --git a/src/files.rs b/src/files.rs new file mode 100644 index 0000000..8f447ec --- /dev/null +++ b/src/files.rs @@ -0,0 +1,97 @@ +use anyhow::{anyhow, Result}; +use chrono::NaiveDate; +use flate2::read::GzDecoder; +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::PathBuf; + +/// Discovers log files for a given date range +pub struct LogFileDiscovery { + base_dir: PathBuf, + filename: String, +} + +impl LogFileDiscovery { + pub fn new(base_dir: PathBuf, filename: String) -> Self { + Self { base_dir, filename } + } + + /// Returns an iterator over all log files in the date range + pub fn discover(&self, from: NaiveDate, to: NaiveDate) -> Result> { + let mut files = Vec::new(); + + let mut current = from; + while current <= to { + if let Some(log_file) = self.find_log_for_date(current)? { + files.push(log_file); + } + current = current + .succ_opt() + .ok_or_else(|| anyhow!("Date overflow"))?; + } + + Ok(files) + } + + fn find_log_for_date(&self, date: NaiveDate) -> Result> { + // Build path: /yyyy/mm/dd/.gz or + let date_path = self + .base_dir + .join(date.format("%Y").to_string()) + .join(date.format("%m").to_string()) + .join(date.format("%d").to_string()); + + // Try gzipped first + let gz_path = date_path.join(format!("{}.gz", self.filename)); + if gz_path.exists() { + return Ok(Some(LogFile { + path: gz_path, + compressed: true, + })); + } + + // Try uncompressed + let plain_path = date_path.join(&self.filename); + if plain_path.exists() { + return Ok(Some(LogFile { + path: plain_path, + compressed: false, + })); + } + + // No file found for this date + Ok(None) + } +} + +#[derive(Debug)] +pub struct LogFile { + pub path: PathBuf, + pub compressed: bool, +} + +impl LogFile { + /// Returns a buffered reader for this log file, handling compression transparently + pub fn reader(&self) -> Result> { + let file = File::open(&self.path)?; + + if self.compressed { + let decoder = GzDecoder::new(file); + Ok(Box::new(BufReader::new(decoder))) + } else { + Ok(Box::new(BufReader::new(file))) + } + } +} + +/// For reading a single file directly (e.g., for testing) +pub fn read_log_file(path: &str) -> Result> { + let file = File::open(path)?; + + if path.ends_with(".gz") { + let decoder = GzDecoder::new(file); + Ok(Box::new(BufReader::new(decoder))) + } else { + Ok(Box::new(BufReader::new(file))) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..5cfe42f --- /dev/null +++ b/src/main.rs @@ -0,0 +1,338 @@ +use anyhow::{anyhow, Result}; +use chrono::NaiveDate; +use clap::Parser; +use crossbeam_channel::{bounded, Sender}; +use rayon::prelude::*; +use std::collections::HashMap; +use std::io::BufRead; +use std::path::PathBuf; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::thread; + +mod db; +mod files; +mod parser; + +use db::Database; +use files::{read_log_file, LogFile, LogFileDiscovery}; +use parser::{ParsedMessage, ParserRegistry, SignatureEntry}; + +#[derive(Parser, Debug)] +#[command(author, version, about = "Load log files into SQLite database")] +struct Args { + /// Start date (YYYY/mm/dd) + #[arg(long)] + from: Option, + + /// End date (YYYY/mm/dd) + #[arg(long)] + to: Option, + + /// Base directory containing log files + #[arg(long)] + base_dir: Option, + + /// Log filename (without date path, e.g., "app.log") + #[arg(long)] + filename: Option, + + /// Single log file to process (alternative to date range) + #[arg(long)] + file: Option, + + /// Output SQLite database path + #[arg(long, short)] + output: String, + + /// Batch size for database inserts + #[arg(long, default_value = "10000")] + batch_size: usize, + + /// Number of parallel threads for file processing (0 = use all available cores) + #[arg(long, default_value = "0")] + threads: usize, +} + +fn parse_date(s: &str) -> Result { + NaiveDate::parse_from_str(s, "%Y/%m/%d") + .map_err(|e| anyhow!("Invalid date format '{}': {}. Expected YYYY/mm/dd", s, e)) +} + +fn main() -> Result<()> { + let args = Args::parse(); + + // Configure rayon thread pool if threads specified + if args.threads > 0 { + rayon::ThreadPoolBuilder::new() + .num_threads(args.threads) + .build_global() + .ok(); // Ignore error if pool already initialized + } + + let use_parallel = args.threads != 1; + + if let Some(file_path) = &args.file { + // Process single file (no parallelism needed) + let mut db = Database::new(&args.output)?; + let registry = ParserRegistry::new(); + eprintln!("Processing single file: {}", file_path.display()); + let reader = read_log_file(file_path.to_str().unwrap())?; + process_reader(reader, ®istry, &mut db, args.batch_size)?; + } else { + // Process date range + let from = parse_date( + args.from + .as_ref() + .ok_or_else(|| anyhow!("--from is required when not using --file"))?, + )?; + let to = parse_date( + args.to + .as_ref() + .ok_or_else(|| anyhow!("--to is required when not using --file"))?, + )?; + let base_dir = args + .base_dir + .as_ref() + .ok_or_else(|| anyhow!("--base-dir is required when not using --file"))?; + let filename = args + .filename + .as_ref() + .ok_or_else(|| anyhow!("--filename is required when not using --file"))?; + + let discovery = LogFileDiscovery::new(base_dir.clone(), filename.clone()); + let log_files = discovery.discover(from, to)?; + + if log_files.is_empty() { + eprintln!("No log files found in the specified date range"); + return Ok(()); + } + + eprintln!("Found {} log files to process", log_files.len()); + + if use_parallel && log_files.len() > 1 { + process_files_parallel(log_files, &args.output, args.batch_size)?; + } else { + process_files_sequential(log_files, &args.output, args.batch_size)?; + } + } + + eprintln!("Done!"); + Ok(()) +} + +fn process_files_sequential( + log_files: Vec, + output: &str, + batch_size: usize, +) -> Result<()> { + let mut db = Database::new(output)?; + let registry = ParserRegistry::new(); + + for log_file in log_files { + eprintln!( + "Processing: {} ({})", + log_file.path.display(), + if log_file.compressed { + "compressed" + } else { + "plain" + } + ); + let reader = log_file.reader()?; + process_reader(reader, ®istry, &mut db, batch_size)?; + } + Ok(()) +} + +fn process_files_parallel(log_files: Vec, output: &str, batch_size: usize) -> Result<()> { + let num_threads = rayon::current_num_threads(); + eprintln!("Processing {} files with {} threads", log_files.len(), num_threads); + + // Channel for sending parsed entries to the DB writer + // Buffer size: enough batches to keep workers busy without excessive memory + let (sender, receiver) = bounded::>(num_threads * 2); + + // Shared counters for progress reporting + let total_lines = Arc::new(AtomicU64::new(0)); + let parsed_lines = Arc::new(AtomicU64::new(0)); + let error_count = Arc::new(AtomicU64::new(0)); + + // Spawn DB writer thread + let output_path = output.to_string(); + let db_handle = thread::spawn(move || -> Result<()> { + let mut db = Database::new(&output_path)?; + + for batch in receiver { + let tx = db.begin_transaction()?; + Database::insert_signature_batch(&tx, &batch)?; + tx.commit()?; + } + + Ok(()) + }); + + // Process files in parallel + let result: Result<()> = log_files + .into_par_iter() + .try_for_each(|log_file| { + let file_path = log_file.path.display().to_string(); + let compressed = if log_file.compressed { "compressed" } else { "plain" }; + eprintln!("Starting: {} ({})", file_path, compressed); + + process_file_parallel( + log_file, + &sender, + batch_size, + &total_lines, + &parsed_lines, + &error_count, + )?; + + eprintln!("Finished: {}", file_path); + Ok(()) + }); + + // Close the channel so DB writer knows to stop + drop(sender); + + // Wait for DB writer to finish + db_handle + .join() + .map_err(|_| anyhow!("DB writer thread panicked"))??; + + // Print final stats + eprintln!( + "Total: {} lines read, {} parsed, {} errors", + total_lines.load(Ordering::Relaxed), + parsed_lines.load(Ordering::Relaxed), + error_count.load(Ordering::Relaxed) + ); + + result +} + +fn process_file_parallel( + log_file: LogFile, + sender: &Sender>, + batch_size: usize, + total_lines: &AtomicU64, + parsed_lines: &AtomicU64, + error_count: &AtomicU64, +) -> Result<()> { + let registry = ParserRegistry::new(); + let reader = log_file.reader()?; + + let mut batch: Vec = Vec::with_capacity(batch_size); + let mut file_lines = 0u64; + let mut file_parsed = 0u64; + let mut file_errors = 0u64; + + for line_result in reader.lines() { + let line = line_result?; + file_lines += 1; + + if let Some(parse_result) = registry.parse(&line) { + match parse_result { + Ok(ParsedMessage::Signature(entry)) => { + batch.push(entry); + file_parsed += 1; + + if batch.len() >= batch_size { + sender.send(std::mem::replace( + &mut batch, + Vec::with_capacity(batch_size), + ))?; + } + } + Err(_) => { + file_errors += 1; + } + } + } + } + + // Send remaining entries + if !batch.is_empty() { + sender.send(batch)?; + } + + // Update shared counters + total_lines.fetch_add(file_lines, Ordering::Relaxed); + parsed_lines.fetch_add(file_parsed, Ordering::Relaxed); + error_count.fetch_add(file_errors, Ordering::Relaxed); + + Ok(()) +} + +fn process_reader( + reader: Box, + registry: &ParserRegistry, + db: &mut Database, + batch_size: usize, +) -> Result<()> { + let mut signature_batch: Vec = Vec::with_capacity(batch_size); + let mut total_lines = 0u64; + let mut parsed_lines = 0u64; + let mut error_counts: HashMap = HashMap::new(); + + for line_result in reader.lines() { + let line = line_result?; + total_lines += 1; + + if let Some(parse_result) = registry.parse(&line) { + match parse_result { + Ok(ParsedMessage::Signature(entry)) => { + signature_batch.push(entry); + parsed_lines += 1; + + if signature_batch.len() >= batch_size { + flush_signature_batch(db, &mut signature_batch)?; + } + } + Err(e) => { + *error_counts.entry(e.to_string()).or_insert(0) += 1; + } + } + } + + if total_lines.is_multiple_of(100_000) { + let total_errors: u64 = error_counts.values().sum(); + eprintln!( + "Progress: {} lines read, {} parsed, {} errors", + total_lines, parsed_lines, total_errors + ); + } + } + + // Flush remaining entries + if !signature_batch.is_empty() { + flush_signature_batch(db, &mut signature_batch)?; + } + + let total_errors: u64 = error_counts.values().sum(); + eprintln!( + "File complete: {} lines read, {} parsed, {} errors", + total_lines, parsed_lines, total_errors + ); + + // Print error summary + if !error_counts.is_empty() { + eprintln!("\nParse errors breakdown:"); + let mut errors: Vec<_> = error_counts.into_iter().collect(); + errors.sort_by(|a, b| b.1.cmp(&a.1)); // Sort by count descending + for (error, count) in errors { + eprintln!(" {} ({}x)", error, count); + } + } + + Ok(()) +} + +fn flush_signature_batch(db: &mut Database, batch: &mut Vec) -> Result<()> { + let tx = db.begin_transaction()?; + Database::insert_signature_batch(&tx, batch)?; + tx.commit()?; + batch.clear(); + Ok(()) +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..584b6dd --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,658 @@ +use anyhow::{anyhow, Result}; +use chrono::NaiveDateTime; +use regex::Regex; +use std::sync::LazyLock; + +/// Represents a parsed signature log entry +#[derive(Debug, Clone, PartialEq)] +pub struct SignatureEntry { + pub session_id: String, + /// Timestamp as milliseconds since Unix epoch (UTC) + pub timestamp_ms: i64, + pub app: String, + pub version: String, + pub offline_login_usage: Option, + pub is_password_autofill_enabled: Option, + pub camera_roll_usage: Option, + pub os: Option, + pub app_name: Option, + pub touch_id: Option, + pub is_offline_login_enabled: Option, + pub model: Option, + pub device: Option, + pub password_autofill_usage: Option, +} + +/// Trait for parsing different message types from logs. +/// Implement this trait to add support for new message formats. +pub trait MessageParser: Send + Sync { + /// Attempts to parse a log line. Returns None if this parser doesn't handle this message type. + fn parse(&self, line: &str) -> Option>; +} + +/// Enum of all possible parsed message types. +/// Extend this when adding new message parsers. +#[derive(Debug, Clone)] +pub enum ParsedMessage { + Signature(SignatureEntry), +} + +static SESSION_ID_RE: LazyLock = + LazyLock::new(|| Regex::new(r"sessionId=([^,\s]+)").unwrap()); +static DATETIME_RE: LazyLock = + LazyLock::new(|| Regex::new(r#"dt="(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})(?:,(\d{3}))?"#).unwrap()); +static CORRELATION_ID_RE: LazyLock = + LazyLock::new(|| Regex::new(r"correlationId=([^,\s]+)").unwrap()); +static SIGNATURE_RE: LazyLock = + LazyLock::new(|| Regex::new(r#"msg="signature:([^/]+)/([^/]*)/\s*details:([^"]+)"#).unwrap()); + +// iOS mobile client: signature:appId/version/deviceId details:... +static MOBILE_IOS_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"msg="MOBILE_CLIENT_LOG: signature:([^/]+)/([^/]+)/([^\s]+)\s+details:([^"]+)"#) + .unwrap() +}); + +// Android mobile client: signature:appId/version/{json} details:... +static MOBILE_ANDROID_RE: LazyLock = LazyLock::new(|| { + Regex::new(r#"msg="MOBILE_CLIENT_LOG: signature:([^/]+)/([^/]+)/(\{[^}]+\})\s+details:([^"]+)"#) + .unwrap() +}); + +pub struct SignatureParser; + +impl MessageParser for SignatureParser { + fn parse(&self, line: &str) -> Option> { + // Check if this line contains a signature message (but not MOBILE_CLIENT_LOG) + if !line.contains("msg=\"signature:") { + return None; + } + + // Skip non-mobile client messages + if line.contains("msg=\"signature:WEB_CLIENT") + || line.contains("msg=\"signature:SYNC_CLIENT") + || line.contains("msg=\"signature:BROWSER_EXTENSION") + { + return None; + } + + Some(self.parse_signature_line(line)) + } +} + +impl SignatureParser { + fn parse_signature_line(&self, line: &str) -> Result { + // Extract session ID + let session_id = SESSION_ID_RE + .captures(line) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) + .ok_or_else(|| anyhow!("Missing sessionId"))?; + + // Extract timestamp as milliseconds + let timestamp_ms = extract_timestamp_ms(line)?; + + // Extract signature details + let caps = SIGNATURE_RE + .captures(line) + .ok_or_else(|| anyhow!("Invalid signature format"))?; + + let app = caps.get(1).map(|m| m.as_str().to_string()).unwrap(); + let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); + let details_str = caps.get(3).map(|m| m.as_str()).unwrap(); + + // Parse details key-value pairs + // Handle the tricky "device:iOS, Apple" case by parsing carefully + let details = parse_details(details_str)?; + + let entry = SignatureEntry { + session_id, + timestamp_ms, + app, + version, + offline_login_usage: parse_number(&details, "offlineLoginUsage"), + is_password_autofill_enabled: parse_bool(&details, "isPasswordAutofillEnabled"), + camera_roll_usage: parse_number(&details, "cameraRollUsage"), + os: get_string(&details, "OS"), + app_name: get_string(&details, "appName"), + touch_id: parse_bool(&details, "touchID"), + is_offline_login_enabled: parse_bool(&details, "isOfflineLoginEnabled"), + model: get_string(&details, "model"), + device: get_string(&details, "device"), + password_autofill_usage: parse_number(&details, "passwordAutofillUsage"), + }; + + Ok(ParsedMessage::Signature(entry)) + } +} + +/// Parser for iOS MOBILE_CLIENT_LOG messages +pub struct MobileClientIosParser; + +impl MessageParser for MobileClientIosParser { + fn parse(&self, line: &str) -> Option> { + if !line.contains("MOBILE_CLIENT_LOG:") || !line.contains("sdk-client:IOS") { + return None; + } + + Some(self.parse_mobile_ios_line(line)) + } +} + +impl MobileClientIosParser { + fn parse_mobile_ios_line(&self, line: &str) -> Result { + let timestamp_ms = extract_timestamp_ms(line)?; + let session_id = extract_correlation_id(line)?; + + let caps = MOBILE_IOS_RE + .captures(line) + .ok_or_else(|| anyhow!("Invalid iOS mobile client format"))?; + + let app = caps.get(1).map(|m| m.as_str().to_string()).unwrap(); + let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); + let details_str = caps.get(4).map(|m| m.as_str()).unwrap(); + + let details = parse_mobile_details(details_str); + + let entry = SignatureEntry { + session_id, + timestamp_ms, + app, + version, + offline_login_usage: None, + is_password_autofill_enabled: None, + camera_roll_usage: None, + os: get_string(&details, "os"), + app_name: get_string(&details, "app-name"), + touch_id: None, + is_offline_login_enabled: None, + model: get_string(&details, "model"), + device: Some("iOS".to_string()), + password_autofill_usage: None, + }; + + Ok(ParsedMessage::Signature(entry)) + } +} + +/// Parser for Android MOBILE_CLIENT_LOG messages +pub struct MobileClientAndroidParser; + +impl MessageParser for MobileClientAndroidParser { + fn parse(&self, line: &str) -> Option> { + if !line.contains("MOBILE_CLIENT_LOG:") || !line.contains("sdk-client:ANDROID") { + return None; + } + + Some(self.parse_mobile_android_line(line)) + } +} + +impl MobileClientAndroidParser { + fn parse_mobile_android_line(&self, line: &str) -> Result { + let timestamp_ms = extract_timestamp_ms(line)?; + let session_id = extract_correlation_id(line)?; + + let caps = MOBILE_ANDROID_RE + .captures(line) + .ok_or_else(|| anyhow!("Invalid Android mobile client format"))?; + + let app = caps.get(1).map(|m| m.as_str().to_string()).unwrap(); + let version = caps.get(2).map(|m| m.as_str().to_string()).unwrap(); + let details_str = caps.get(4).map(|m| m.as_str()).unwrap(); + + let details = parse_mobile_details_android(details_str); + + let entry = SignatureEntry { + session_id, + timestamp_ms, + app, + version, + offline_login_usage: None, + is_password_autofill_enabled: None, + camera_roll_usage: None, + os: get_string(&details, "os"), + app_name: Some("native Android".to_string()), + touch_id: None, + is_offline_login_enabled: None, + model: get_string(&details, "model"), + device: get_string(&details, "device"), + password_autofill_usage: None, + }; + + Ok(ParsedMessage::Signature(entry)) + } +} + +/// Extract timestamp from log line as milliseconds since Unix epoch +fn extract_timestamp_ms(line: &str) -> Result { + let caps = DATETIME_RE + .captures(line) + .ok_or_else(|| anyhow!("Missing datetime"))?; + + let datetime_str = caps.get(1).map(|m| m.as_str()).unwrap(); + let millis: i64 = caps + .get(2) + .map(|m| m.as_str().parse().unwrap_or(0)) + .unwrap_or(0); + + let dt = NaiveDateTime::parse_from_str(datetime_str, "%Y-%m-%d %H:%M:%S") + .map_err(|e| anyhow!("Invalid datetime format: {}", e))?; + + Ok(dt.and_utc().timestamp_millis() + millis) +} + +/// Extract correlation ID as session ID for mobile client logs +fn extract_correlation_id(line: &str) -> Result { + CORRELATION_ID_RE + .captures(line) + .and_then(|c| c.get(1)) + .map(|m| m.as_str().to_string()) + .ok_or_else(|| anyhow!("Missing correlationId")) +} + +/// Parse mobile client details for iOS (simple comma-separated key:value) +fn parse_mobile_details(details: &str) -> std::collections::HashMap { + let mut map = std::collections::HashMap::new(); + + // Keys for iOS mobile client + let known_keys = ["sdk-client", "sdk-version", "app-name", "device", "model", "os"]; + + let mut key_positions: Vec<(usize, &str)> = known_keys + .iter() + .filter_map(|&key| { + let pattern = format!("{}:", key); + details.find(&pattern).map(|pos| (pos, key)) + }) + .collect(); + + key_positions.sort_by_key(|&(pos, _)| pos); + + for i in 0..key_positions.len() { + let (pos, key) = key_positions[i]; + let value_start = pos + key.len() + 1; + + let value_end = if i + 1 < key_positions.len() { + let next_pos = key_positions[i + 1].0; + if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { + next_pos - 1 + } else { + next_pos + } + } else { + details.find(" user-agent").unwrap_or(details.len()) + }; + + let value = details[value_start..value_end].trim().to_string(); + map.insert(key.to_string(), value); + } + + map +} + +/// Parse mobile client details for Android (handles device with commas) +fn parse_mobile_details_android(details: &str) -> std::collections::HashMap { + let mut map = std::collections::HashMap::new(); + + // For Android, device can contain commas like "Android, samsung" + // Keys in order: sdk-client, sdk-version, app-name, device, model, os + let known_keys = ["sdk-client", "sdk-version", "app-name", "device", "model", "os"]; + + let mut key_positions: Vec<(usize, &str)> = known_keys + .iter() + .filter_map(|&key| { + let pattern = format!("{}:", key); + details.find(&pattern).map(|pos| (pos, key)) + }) + .collect(); + + key_positions.sort_by_key(|&(pos, _)| pos); + + for i in 0..key_positions.len() { + let (pos, key) = key_positions[i]; + let value_start = pos + key.len() + 1; + + let value_end = if i + 1 < key_positions.len() { + let next_pos = key_positions[i + 1].0; + // Find the comma before the next key + if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { + next_pos - 1 + } else { + next_pos + } + } else { + details.find(" user-agent").unwrap_or(details.len()) + }; + + let value = details[value_start..value_end].trim().to_string(); + map.insert(key.to_string(), value); + } + + map +} + +/// Parse the details string which has format like: +/// offlineLoginUsage:0,isPasswordAutofillEnabled:no,...,device:iOS, Apple,passwordAutofillUsage:0 +fn parse_details(details: &str) -> Result> { + let mut map = std::collections::HashMap::new(); + + // Known keys in order they appear + let known_keys = [ + "offlineLoginUsage", + "isPasswordAutofillEnabled", + "cameraRollUsage", + "OS", + "appName", + "touchID", + "isOfflineLoginEnabled", + "model", + "device", + "passwordAutofillUsage", + ]; + + // Find positions of each key + let mut key_positions: Vec<(usize, &str)> = known_keys + .iter() + .filter_map(|&key| { + let pattern = format!("{}:", key); + details.find(&pattern).map(|pos| (pos, key)) + }) + .collect(); + + // Sort by position + key_positions.sort_by_key(|&(pos, _)| pos); + + // Extract values between keys + for i in 0..key_positions.len() { + let (pos, key) = key_positions[i]; + let value_start = pos + key.len() + 1; // +1 for ':' + + let value_end = if i + 1 < key_positions.len() { + // Value ends at the comma before the next key + let next_pos = key_positions[i + 1].0; + // Find the comma before the next key + if next_pos > 0 && details.as_bytes().get(next_pos - 1) == Some(&b',') { + next_pos - 1 + } else { + next_pos + } + } else { + // Last key - value goes until " user-agent" or end + details + .find(" user-agent") + .unwrap_or(details.len()) + }; + + let value = details[value_start..value_end].trim().to_string(); + map.insert(key.to_string(), value); + } + + Ok(map) +} + +fn parse_number(map: &std::collections::HashMap, key: &str) -> Option { + map.get(key).and_then(|v| v.parse().ok()) +} + +fn parse_bool(map: &std::collections::HashMap, key: &str) -> Option { + map.get(key).and_then(|value| { + match value.to_lowercase().as_str() { + "yes" | "true" | "1" => Some(true), + "no" | "false" | "0" => Some(false), + _ => None, + } + }) +} + +fn get_string(map: &std::collections::HashMap, key: &str) -> Option { + map.get(key).map(|s| s.to_string()) +} + +/// Registry of all available message parsers +pub struct ParserRegistry { + parsers: Vec>, +} + +impl ParserRegistry { + pub fn new() -> Self { + let mut registry = Self { + parsers: Vec::new(), + }; + // Register default parsers (order matters - more specific first) + registry.register(Box::new(MobileClientIosParser)); + registry.register(Box::new(MobileClientAndroidParser)); + registry.register(Box::new(SignatureParser)); + registry + } + + pub fn register(&mut self, parser: Box) { + self.parsers.push(parser); + } + + /// Try to parse a line with all registered parsers + pub fn parse(&self, line: &str) -> Option> { + for parser in &self.parsers { + if let Some(result) = parser.parse(line) { + return Some(result); + } + } + None + } +} + +impl Default for ParserRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_signature_line() { + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", ll=INFO, lc=CreateSessionStep, threadId=188, externalUserId=null, clientIp=***, correlationId=***, sessionId=test-session-123, request_id=[***]***.*** t@tid@.pnull_uid_X_2631582, userId=X, msg="signature:XAMARIN_APP/5.23.0/ details:offlineLoginUsage:0,isPasswordAutofillEnabled:no,cameraRollUsage:0,OS:26.2.0,appName:App,touchID:no,isOfflineLoginEnabled:yes,model:iPhone18,1,device:iOS, Apple,passwordAutofillUsage:0 user-agent:mobileApp/5.23.0", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "test-session-123"); + assert_eq!(entry.app, "XAMARIN_APP"); + assert_eq!(entry.version, "5.23.0"); + assert_eq!(entry.offline_login_usage, Some(0)); + assert_eq!(entry.is_password_autofill_enabled, Some(false)); + assert_eq!(entry.camera_roll_usage, Some(0)); + assert_eq!(entry.os, Some("26.2.0".to_string())); + assert_eq!(entry.app_name, Some("App".to_string())); + assert_eq!(entry.touch_id, Some(false)); + assert_eq!(entry.is_offline_login_enabled, Some(true)); + assert_eq!(entry.model, Some("iPhone18,1".to_string())); + assert_eq!(entry.device, Some("iOS, Apple".to_string())); + assert_eq!(entry.password_autofill_usage, Some(0)); + } + } + } + + #[test] + fn test_parse_non_signature_line() { + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06", msg="some other message""#; + let registry = ParserRegistry::new(); + assert!(registry.parse(line).is_none()); + } + + #[test] + fn test_parse_signature_with_missing_offline_login_usage() { + // Line missing offlineLoginUsage field + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-123, msg="signature:XAMARIN_APP/5.23.0/ details:isPasswordAutofillEnabled:yes,cameraRollUsage:1,OS:26.2.0,appName:App,touchID:yes,isOfflineLoginEnabled:no,model:iPhone15,3,device:iOS, Apple,passwordAutofillUsage:2 user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "test-123"); + assert_eq!(entry.app, "XAMARIN_APP"); + assert_eq!(entry.version, "5.23.0"); + // Missing field should be None + assert_eq!(entry.offline_login_usage, None); + // Other fields should be present + assert_eq!(entry.is_password_autofill_enabled, Some(true)); + assert_eq!(entry.camera_roll_usage, Some(1)); + assert_eq!(entry.os, Some("26.2.0".to_string())); + assert_eq!(entry.app_name, Some("App".to_string())); + assert_eq!(entry.touch_id, Some(true)); + assert_eq!(entry.is_offline_login_enabled, Some(false)); + assert_eq!(entry.model, Some("iPhone15,3".to_string())); + assert_eq!(entry.device, Some("iOS, Apple".to_string())); + assert_eq!(entry.password_autofill_usage, Some(2)); + } + } + } + + #[test] + fn test_parse_signature_with_missing_password_autofill_usage() { + // Line missing passwordAutofillUsage (truncated log line scenario) + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-456, msg="signature:XAMARIN_APP/5.23.0/ details:offlineLoginUsage:0,isPasswordAutofillEnabled:no,cameraRollUsage:0,OS:16.0.0,appName:App,touchID:yes,isOfflineLoginEnabled:yes,model:SM-S938B,device:Android", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "test-456"); + assert_eq!(entry.app, "XAMARIN_APP"); + // passwordAutofillUsage missing + assert_eq!(entry.password_autofill_usage, None); + // Other fields present + assert_eq!(entry.offline_login_usage, Some(0)); + assert_eq!(entry.device, Some("Android".to_string())); + } + } + } + + #[test] + fn test_parse_signature_with_multiple_missing_fields() { + // Line missing multiple fields + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-789, msg="signature:XAMARIN_APP/5.23.0/ details:OS:26.2.0,appName:App,model:iPhone18,1 user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "test-789"); + assert_eq!(entry.app, "XAMARIN_APP"); + assert_eq!(entry.version, "5.23.0"); + // Missing fields should be None + assert_eq!(entry.offline_login_usage, None); + assert_eq!(entry.is_password_autofill_enabled, None); + assert_eq!(entry.camera_roll_usage, None); + assert_eq!(entry.touch_id, None); + assert_eq!(entry.is_offline_login_enabled, None); + assert_eq!(entry.device, None); + assert_eq!(entry.password_autofill_usage, None); + // Present fields should have values + assert_eq!(entry.os, Some("26.2.0".to_string())); + assert_eq!(entry.app_name, Some("App".to_string())); + assert_eq!(entry.model, Some("iPhone18,1".to_string())); + } + } + } + + #[test] + fn test_web_client_signature_is_skipped() { + // WEB_CLIENT signature messages should be skipped (return None) + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-123, msg="signature:WEB_CLIENT/1.0.0/ details:browser:Chrome,OS:Windows user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + assert!(registry.parse(line).is_none()); + } + + #[test] + fn test_web_client_with_version_is_skipped() { + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-456, msg="signature:WEB_CLIENT/2.5.0/ details:something:value user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + assert!(registry.parse(line).is_none()); + } + + #[test] + fn test_sync_client_signature_is_skipped() { + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-789, msg="signature:SYNC_CLIENT/1.2.0/ details:platform:Windows,OS:10.0 user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + assert!(registry.parse(line).is_none()); + } + + #[test] + fn test_browser_extension_signature_is_skipped() { + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-abc, msg="signature:BROWSER_EXTENSION/3.0.1/ details:browser:Firefox,OS:MacOS user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + assert!(registry.parse(line).is_none()); + } + + #[test] + fn test_parse_mobile_client_ios() { + let line = r#"Jan 21 00:01:55 tom012 m1s-kv dt="2026-01-21 00:01:55,573", ll=INFO, lc=com.xyz.domains.user.controllers.UserControllerV1, correlationId=aXAJY9JuqalC5uxAaB6EsgAAALo, applicationName=ApiGateway, msg="MOBILE_CLIENT_LOG: signature:com.xyz.app/2.1.0/738D8CD5-28BC-490C-AB02-2C309FA64875 details:sdk-client:IOS,sdk-version:1.4.0,app-name:App,device:iOS,model:iPhone14,4,os:26.2 user-agent:26.2", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "aXAJY9JuqalC5uxAaB6EsgAAALo"); + assert_eq!(entry.app, "com.xyz.app"); + assert_eq!(entry.version, "2.1.0"); + assert_eq!(entry.model, Some("iPhone14,4".to_string())); + assert_eq!(entry.os, Some("26.2".to_string())); + assert_eq!(entry.app_name, Some("App".to_string())); + assert_eq!(entry.device, Some("iOS".to_string())); + // These fields are not present in mobile client logs + assert_eq!(entry.offline_login_usage, None); + assert_eq!(entry.is_password_autofill_enabled, None); + } + } + } + + #[test] + fn test_parse_mobile_client_android() { + let line = r#"Jan 21 01:01:59 tom011 m1s-kv dt="2026-01-21 01:01:59,647", ll=INFO, lc=com.xyz.domains.user.controllers.UserControllerV1, correlationId=aXAXdwzcGYAFt6MF_m4tKAAAAg4, applicationName=ApiGateway, msg="MOBILE_CLIENT_LOG: signature:com.xyz.mobile.app/2.1.0/{"client":"2a4f12c97b64948d","version":"1.4.0"} details:sdk-client:ANDROID,sdk-version:1.4.0,app-name:2.1.0,device:Android, samsung,model:SM-A536B,os:16 user-agent:16", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.session_id, "aXAXdwzcGYAFt6MF_m4tKAAAAg4"); + assert_eq!(entry.app, "com.xyz.mobile.app"); + assert_eq!(entry.version, "2.1.0"); + assert_eq!(entry.model, Some("SM-A536B".to_string())); + assert_eq!(entry.os, Some("16".to_string())); + assert_eq!(entry.app_name, Some("native Android".to_string())); + assert_eq!(entry.device, Some("Android, samsung".to_string())); + // These fields are not present in mobile client logs + assert_eq!(entry.offline_login_usage, None); + assert_eq!(entry.is_password_autofill_enabled, None); + } + } + } + + #[test] + fn test_xamarin_app_still_parsed() { + // Ensure XAMARIN_APP messages are still parsed after adding WEB_CLIENT filter + let line = r#"Jan 21 00:00:06 tom013 m1s-kv dt="2026-01-21 00:00:06,154", sessionId=test-session-123, msg="signature:XAMARIN_APP/5.23.0/ details:offlineLoginUsage:0,isPasswordAutofillEnabled:yes,cameraRollUsage:0,OS:26.2.0,appName:App,touchID:yes,isOfflineLoginEnabled:yes,model:iPhone18,1,device:iOS, Apple,passwordAutofillUsage:0 user-agent:test", ex=""#; + + let registry = ParserRegistry::new(); + let result = registry.parse(line).unwrap().unwrap(); + + match result { + ParsedMessage::Signature(entry) => { + assert_eq!(entry.app, "XAMARIN_APP"); + assert_eq!(entry.version, "5.23.0"); + } + } + } +}