diff --git a/apps/elf-eval/tests/real_world_job_benchmark/baseline_counts.rs b/apps/elf-eval/tests/real_world_job_benchmark/baseline_counts.rs new file mode 100644 index 00000000..9b8cd61a --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/baseline_counts.rs @@ -0,0 +1,232 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_dreaming_readiness_baseline_counts( + ledger: &Value, + stages: &[Value], +) -> Result<()> { + let current = support::find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?; + + assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(current.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(current.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); + assert!( + current + .pointer("/baseline_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five current-vs-historical jobs")) + ); + assert!( + current + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("passes all six encoded jobs")) + ); + + let preference = support::find_by_field(stages, "/stage_id", "preference_evolution")?; + + assert_eq!( + preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(preference.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!( + preference.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + preference.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + + let tombstone = support::find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?; + + assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!(tombstone.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); + assert_eq!( + tombstone.pointer("/comparison_judgment").and_then(Value::as_str), + Some("unchanged") + ); + assert!( + tombstone + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("tombstone and invalidation evidence")) + ); + + let consolidation = support::find_by_field(stages, "/stage_id", "reviewable_consolidation")?; + + assert_eq!( + consolidation.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!( + consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), + Some(1) + ); + assert_eq!(consolidation.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + consolidation.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), + Some(0) + ); + assert!( + consolidation + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("apply/defer/discard audit") + && basis.contains("zero source mutations")) + ); + + let scheduled = support::find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; + + assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); + assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(scheduled.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!(scheduled.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!( + scheduled.pointer("/post_stage_counts/trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + scheduled.pointer("/post_stage_counts/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + + assert_dreaming_final_competitor_retest_stage(ledger, stages)?; + assert_dreaming_memory_summary_stage(stages)?; + assert_dreaming_proactive_brief_stage(stages)?; + + Ok(()) +} + +fn assert_dreaming_final_competitor_retest_stage(ledger: &Value, stages: &[Value]) -> Result<()> { + let retest = support::find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; + + assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22)); + assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); + assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11)); + assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); + assert_eq!(retest.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(40)); + assert_eq!(retest.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(retest.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(7)); + assert_eq!(retest.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), Some(19)); + assert_eq!(retest.pointer("/qmd_post_stage_counts/pass").and_then(Value::as_u64), Some(17)); + assert_eq!( + retest.pointer("/qmd_post_stage_counts/wrong_result").and_then(Value::as_u64), + Some(13) + ); + assert!(retest.pointer("/post_stage_basis").and_then(Value::as_str).is_some_and(|basis| { + basis.contains("XY-955 closeout retest") + && basis.contains("qmd live adapter materialization is 17 pass") + })); + + assert_dreaming_readiness_summary_buckets(ledger) +} + +fn assert_dreaming_readiness_summary_buckets(ledger: &Value) -> Result<()> { + assert!(support::array_contains_str( + ledger, + "/summary/improved", + "current_vs_historical_correctness" + )?); + assert!(support::array_contains_str(ledger, "/summary/improved", "preference_evolution")?); + assert!(support::array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); + assert!(support::array_contains_str( + ledger, + "/summary/improved", + "memory_summary_top_of_mind_behavior" + )?); + assert!(support::array_contains_str(ledger, "/summary/improved", "proactive_brief_readiness")?); + assert!(support::array_contains_str( + ledger, + "/summary/improved", + "scheduled_memory_task_readiness" + )?); + assert!(support::array_at(ledger, "/summary/regressed")?.is_empty()); + assert!(support::array_contains_str( + ledger, + "/summary/unchanged", + "deletion_ttl_tombstone_behavior" + )?); + assert!(support::array_contains_str( + ledger, + "/summary/unchanged", + "final_competitor_retest_status" + )?); + assert!(support::array_at(ledger, "/summary/blocked")?.is_empty()); + assert!(support::array_at(ledger, "/summary/not_tested")?.is_empty()); + + Ok(()) +} + +fn assert_dreaming_memory_summary_stage(stages: &[Value]) -> Result<()> { + let summary_stage = + support::find_by_field(stages, "/stage_id", "memory_summary_top_of_mind_behavior")?; + + assert_eq!( + summary_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(summary_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(9)); + assert_eq!( + summary_stage.pointer("/post_stage_counts/not_tested").and_then(Value::as_u64), + Some(0) + ); + assert!( + summary_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("fixture-backed memory_summary job") + && basis.contains("unsupported-claim flags")) + ); + + Ok(()) +} + +fn assert_dreaming_proactive_brief_stage(stages: &[Value]) -> Result<()> { + let proactive_stage = support::find_by_field(stages, "/stage_id", "proactive_brief_readiness")?; + + assert_eq!( + proactive_stage.pointer("/comparison_judgment").and_then(Value::as_str), + Some("improved") + ); + assert_eq!(proactive_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/evidence_ref_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage.pointer("/post_stage_counts/freshness_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/action_rationale_coverage") + .and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + proactive_stage + .pointer("/post_stage_counts/tombstone_violation_count") + .and_then(Value::as_u64), + Some(0) + ); + assert!( + proactive_stage + .pointer("/post_stage_basis") + .and_then(Value::as_str) + .is_some_and(|basis| basis.contains("five proactive_brief fixture jobs") + && basis.contains("typed private-corpus refresh blocker")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/dreaming_readiness.rs b/apps/elf-eval/tests/real_world_job_benchmark/dreaming_readiness.rs index b1678820..0b8e946e 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/dreaming_readiness.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/dreaming_readiness.rs @@ -1,6 +1,10 @@ +mod baseline_counts; +mod header_shape; +mod markdown; + use std::fs; -use color_eyre::{Result, eyre}; +use color_eyre::Result; use serde_json::Value; use crate::support; @@ -13,333 +17,10 @@ fn dreaming_readiness_stage_ledger_preserves_gate_shape() -> Result<()> { let markdown = fs::read_to_string(support::dreaming_readiness_stage_ledger_markdown_path()?)?; let stages = support::array_at(&ledger, "/stage_gates")?; - assert_dreaming_readiness_ledger_header(&ledger)?; - assert_dreaming_readiness_stage_shape(&ledger, stages)?; - assert_dreaming_readiness_baseline_counts(&ledger, stages)?; - assert_dreaming_readiness_markdown_boundaries(&markdown); - - Ok(()) -} - -fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> { - assert_eq!( - ledger.pointer("/schema").and_then(Value::as_str), - Some("elf.dreaming_readiness_stage_ledger/v1") - ); - assert_eq!(ledger.pointer("/authority").and_then(Value::as_str), Some("XY-951")); - - for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] { - assert!(support::array_contains_str(ledger, "/judgment_terms", term)?); - } - for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] { - assert!(support::array_contains_str(ledger, "/count_fields", term)?); - } - - Ok(()) -} - -fn assert_dreaming_readiness_stage_shape(ledger: &Value, stages: &[Value]) -> Result<()> { - assert_eq!(stages.len(), 8); - - for stage_id in [ - "current_vs_historical_correctness", - "preference_evolution", - "deletion_ttl_tombstone_behavior", - "reviewable_consolidation", - "memory_summary_top_of_mind_behavior", - "proactive_brief_readiness", - "scheduled_memory_task_readiness", - "final_competitor_retest_status", - ] { - support::find_by_field(stages, "/stage_id", stage_id)?; - } - for stage in stages { - let stage_id = - stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or(""); - - assert!( - !support::array_at(stage, "/baseline_commands")?.is_empty(), - "{stage_id} missing baseline commands" - ); - assert!( - !support::array_at(stage, "/post_stage_commands")?.is_empty(), - "{stage_id} missing post-stage commands" - ); - assert!( - !support::array_at(stage, "/evidence_files")?.is_empty(), - "{stage_id} missing evidence files" - ); - - for count_field in support::string_array_at(ledger, "/count_fields")? { - let pointer = format!("/baseline_counts/{count_field}"); - - assert!( - stage.pointer(&pointer).and_then(Value::as_u64).is_some(), - "{stage_id} missing {pointer}" - ); - } - - let judgment = stage - .pointer("/comparison_judgment") - .and_then(Value::as_str) - .ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?; - - assert!(support::array_contains_str(ledger, "/judgment_terms", judgment)?); - } - - Ok(()) -} - -fn assert_dreaming_readiness_baseline_counts(ledger: &Value, stages: &[Value]) -> Result<()> { - let current = support::find_by_field(stages, "/stage_id", "current_vs_historical_correctness")?; - - assert_eq!(current.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); - assert_eq!(current.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); - assert_eq!(current.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(6)); - assert_eq!(current.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(current.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); - assert!( - current - .pointer("/baseline_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("five current-vs-historical jobs")) - ); - assert!( - current - .pointer("/post_stage_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("passes all six encoded jobs")) - ); - - let preference = support::find_by_field(stages, "/stage_id", "preference_evolution")?; - - assert_eq!( - preference.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(preference.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); - assert_eq!( - preference.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - preference.pointer("/comparison_judgment").and_then(Value::as_str), - Some("improved") - ); - - let tombstone = support::find_by_field(stages, "/stage_id", "deletion_ttl_tombstone_behavior")?; - - assert_eq!(tombstone.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(1)); - assert_eq!(tombstone.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(1)); - assert_eq!( - tombstone.pointer("/comparison_judgment").and_then(Value::as_str), - Some("unchanged") - ); - assert!( - tombstone - .pointer("/post_stage_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("tombstone and invalidation evidence")) - ); - - let consolidation = support::find_by_field(stages, "/stage_id", "reviewable_consolidation")?; - - assert_eq!( - consolidation.pointer("/comparison_judgment").and_then(Value::as_str), - Some("improved") - ); - assert_eq!( - consolidation.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), - Some(1) - ); - assert_eq!(consolidation.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); - assert_eq!( - consolidation.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), - Some(0) - ); - assert!( - consolidation - .pointer("/post_stage_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("apply/defer/discard audit") - && basis.contains("zero source mutations")) - ); - - let scheduled = support::find_by_field(stages, "/stage_id", "scheduled_memory_task_readiness")?; - - assert_eq!(scheduled.pointer("/comparison_judgment").and_then(Value::as_str), Some("improved")); - assert_eq!(scheduled.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(1)); - assert_eq!(scheduled.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); - assert_eq!(scheduled.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(1)); - assert_eq!( - scheduled.pointer("/post_stage_counts/trace_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - scheduled.pointer("/post_stage_counts/source_mutation_count").and_then(Value::as_u64), - Some(0) - ); - - assert_dreaming_final_competitor_retest_stage(ledger, stages)?; - assert_dreaming_memory_summary_stage(stages)?; - assert_dreaming_proactive_brief_stage(stages)?; - - Ok(()) -} - -fn assert_dreaming_final_competitor_retest_stage(ledger: &Value, stages: &[Value]) -> Result<()> { - let retest = support::find_by_field(stages, "/stage_id", "final_competitor_retest_status")?; - - assert_eq!(retest.pointer("/baseline_counts/pass").and_then(Value::as_u64), Some(22)); - assert_eq!(retest.pointer("/baseline_counts/wrong_result").and_then(Value::as_u64), Some(5)); - assert_eq!(retest.pointer("/baseline_counts/blocked").and_then(Value::as_u64), Some(2)); - assert_eq!(retest.pointer("/baseline_counts/not_tested").and_then(Value::as_u64), Some(11)); - assert_eq!(retest.pointer("/baseline_counts/not_encoded").and_then(Value::as_u64), Some(11)); - assert_eq!(retest.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(40)); - assert_eq!(retest.pointer("/post_stage_counts/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(retest.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), Some(7)); - assert_eq!(retest.pointer("/post_stage_counts/not_encoded").and_then(Value::as_u64), Some(19)); - assert_eq!(retest.pointer("/qmd_post_stage_counts/pass").and_then(Value::as_u64), Some(17)); - assert_eq!( - retest.pointer("/qmd_post_stage_counts/wrong_result").and_then(Value::as_u64), - Some(13) - ); - assert!(retest.pointer("/post_stage_basis").and_then(Value::as_str).is_some_and(|basis| { - basis.contains("XY-955 closeout retest") - && basis.contains("qmd live adapter materialization is 17 pass") - })); - - assert_dreaming_readiness_summary_buckets(ledger) -} - -fn assert_dreaming_readiness_summary_buckets(ledger: &Value) -> Result<()> { - assert!(support::array_contains_str( - ledger, - "/summary/improved", - "current_vs_historical_correctness" - )?); - assert!(support::array_contains_str(ledger, "/summary/improved", "preference_evolution")?); - assert!(support::array_contains_str(ledger, "/summary/improved", "reviewable_consolidation")?); - assert!(support::array_contains_str( - ledger, - "/summary/improved", - "memory_summary_top_of_mind_behavior" - )?); - assert!(support::array_contains_str(ledger, "/summary/improved", "proactive_brief_readiness")?); - assert!(support::array_contains_str( - ledger, - "/summary/improved", - "scheduled_memory_task_readiness" - )?); - assert!(support::array_at(ledger, "/summary/regressed")?.is_empty()); - assert!(support::array_contains_str( - ledger, - "/summary/unchanged", - "deletion_ttl_tombstone_behavior" - )?); - assert!(support::array_contains_str( - ledger, - "/summary/unchanged", - "final_competitor_retest_status" - )?); - assert!(support::array_at(ledger, "/summary/blocked")?.is_empty()); - assert!(support::array_at(ledger, "/summary/not_tested")?.is_empty()); + header_shape::assert_dreaming_readiness_ledger_header(&ledger)?; + header_shape::assert_dreaming_readiness_stage_shape(&ledger, stages)?; + baseline_counts::assert_dreaming_readiness_baseline_counts(&ledger, stages)?; + markdown::assert_dreaming_readiness_markdown_boundaries(&markdown); Ok(()) } - -fn assert_dreaming_memory_summary_stage(stages: &[Value]) -> Result<()> { - let summary_stage = - support::find_by_field(stages, "/stage_id", "memory_summary_top_of_mind_behavior")?; - - assert_eq!( - summary_stage.pointer("/comparison_judgment").and_then(Value::as_str), - Some("improved") - ); - assert_eq!(summary_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(9)); - assert_eq!( - summary_stage.pointer("/post_stage_counts/not_tested").and_then(Value::as_u64), - Some(0) - ); - assert!( - summary_stage - .pointer("/post_stage_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("fixture-backed memory_summary job") - && basis.contains("unsupported-claim flags")) - ); - - Ok(()) -} - -fn assert_dreaming_proactive_brief_stage(stages: &[Value]) -> Result<()> { - let proactive_stage = support::find_by_field(stages, "/stage_id", "proactive_brief_readiness")?; - - assert_eq!( - proactive_stage.pointer("/comparison_judgment").and_then(Value::as_str), - Some("improved") - ); - assert_eq!(proactive_stage.pointer("/post_stage_counts/pass").and_then(Value::as_u64), Some(4)); - assert_eq!( - proactive_stage.pointer("/post_stage_counts/blocked").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - proactive_stage.pointer("/post_stage_counts/evidence_ref_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - proactive_stage.pointer("/post_stage_counts/freshness_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - proactive_stage - .pointer("/post_stage_counts/action_rationale_coverage") - .and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - proactive_stage - .pointer("/post_stage_counts/tombstone_violation_count") - .and_then(Value::as_u64), - Some(0) - ); - assert!( - proactive_stage - .pointer("/post_stage_basis") - .and_then(Value::as_str) - .is_some_and(|basis| basis.contains("five proactive_brief fixture jobs") - && basis.contains("typed private-corpus refresh blocker")) - ); - - Ok(()) -} - -fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { - assert!( - markdown.contains("`improved`: current-vs-historical correctness, preference evolution") - && markdown.contains("reviewable") - && markdown.contains("proactive brief") - ); - assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); - assert!(markdown.contains("XY-953 adds a direct `proactive_brief` suite")); - assert!(markdown.contains("XY-954 adds a direct `scheduled_memory` suite")); - assert!(markdown.contains( - "Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity" - )); - assert!( - markdown - .contains("Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks") - ); - assert!(markdown.contains("`regressed`: none")); - assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); - assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); - assert!(markdown.contains("XY-955 closes the final competitor retest row")); - assert!(markdown.contains("XY-905")); - assert!(markdown.contains("qmd live `pass=17`, `wrong_result=13`")); - assert!( - markdown - .contains("Do not claim this ledger proves preference history against mem0/OpenMemory") - ); - assert!(markdown.contains("Reviewable consolidation now has ELF live service-backed")); -} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/graph_rag.rs b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag.rs index d60b7975..31bfb398 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/graph_rag.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag.rs @@ -1,363 +1,7 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -pub(super) fn assert_graphify_adapter(adapter: &Value) -> Result<()> { - assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); - assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(adapter.pointer("/setup/status").and_then(Value::as_str), Some("pass")); - assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("pass")); - assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - adapter.pointer("/setup/command").and_then(Value::as_str), - Some("cargo make smoke-graphify-docker-graph-report") - ); - assert_eq!( - adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), - Some("knowledge_compilation") - ); - assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!(adapter.pointer("/suites/1/suite_id").and_then(Value::as_str), Some("retrieval")); - assert_eq!(adapter.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), - Some( - "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" - ) - ); - - let capabilities = support::array_at(adapter, "/capabilities")?; - let quality = support::find_by_field(capabilities, "/capability", "quality_or_scale_claim")?; - - assert_eq!(quality.pointer("/status").and_then(Value::as_str), Some("not_encoded")); - assert!(support::array_at(adapter, "/notes")?.iter().any(|note| { - note.as_str().is_some_and(|text| text.contains("tiny smoke") && text.contains("non-pass")) - })); - - Ok(()) -} - -pub(super) fn assert_graph_rag_representative_scenarios( - ragflow: &Value, - lightrag: &Value, - graphrag: &Value, - graphiti_zep: &Value, - graphify: &Value, -) -> Result<()> { - let ragflow_scenarios = support::array_at(ragflow, "/scenarios")?; - let lightrag_scenarios = support::array_at(lightrag, "/scenarios")?; - let graphrag_scenarios = support::array_at(graphrag, "/scenarios")?; - let graphiti_scenarios = support::array_at(graphiti_zep, "/scenarios")?; - let graphify_scenarios = support::array_at(graphify, "/scenarios")?; - let ragflow_chunk = support::find_by_field( - ragflow_scenarios, - "/scenario_id", - "reference_chunk_citation_mapping", - )?; - let lightrag_context = support::find_by_field( - lightrag_scenarios, - "/scenario_id", - "context_source_reference_mapping", - )?; - let graphrag_tables = support::find_by_field( - graphrag_scenarios, - "/scenario_id", - "output_table_citation_mapping", - )?; - let graphiti_temporal = support::find_by_field( - graphiti_scenarios, - "/scenario_id", - "temporal_validity_window_mapping", - )?; - let graphify_lint = - support::find_by_field(graphify_scenarios, "/scenario_id", "graph_report_navigation_lint")?; - - assert_eq!( - ragflow_chunk.pointer("/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); - assert_eq!(lightrag_context.pointer("/status").and_then(Value::as_str), Some("incomplete")); - assert_eq!( - lightrag_context.pointer("/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); - assert_eq!( - graphrag_tables.pointer("/artifact").and_then(Value::as_str), - Some( - "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" - ) - ); - assert_eq!( - graphiti_temporal.pointer("/comparison_outcome").and_then(Value::as_str), - Some("blocked") - ); - assert_eq!(graphify_lint.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - graphify_lint.pointer("/comparison_outcome").and_then(Value::as_str), - Some("not_tested") - ); - assert!( - graphify_lint - .pointer("/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| evidence.contains("not an ELF victory claim")) - ); - - assert_adapter_matrix_rows( - ragflow_scenarios, - &[ - ("reference_chunk_citation_mapping", "blocked", "blocked"), - ("retrieval_quality_reference_recall", "blocked", "blocked"), - ("navigation_quality_document_chunks", "blocked", "blocked"), - ("answer_faithfulness_reference_chunks", "blocked", "blocked"), - ("stale_source_behavior", "not_encoded", "not_tested"), - ("knowledge_compilation_quality", "not_encoded", "not_tested"), - ], - )?; - assert_adapter_matrix_rows( - lightrag_scenarios, - &[ - ("context_source_reference_mapping", "incomplete", "blocked"), - ("retrieval_quality_context_recall", "incomplete", "blocked"), - ("citation_quality_context_references", "incomplete", "blocked"), - ("navigation_quality_graph_context", "incomplete", "blocked"), - ("answer_faithfulness_context_refs", "incomplete", "blocked"), - ("stale_source_behavior", "not_encoded", "not_tested"), - ("knowledge_compilation_quality", "not_encoded", "not_tested"), - ], - )?; - assert_adapter_matrix_rows( - graphrag_scenarios, - &[ - ("output_table_citation_mapping", "blocked", "blocked"), - ("retrieval_quality_local_search", "not_encoded", "not_tested"), - ("navigation_quality_community_graph", "blocked", "blocked"), - ("answer_faithfulness_output_tables", "blocked", "blocked"), - ("stale_source_behavior", "not_encoded", "not_tested"), - ("graph_summary_synthesis_quality", "not_encoded", "not_tested"), - ], - )?; - - Ok(()) -} - -fn assert_adapter_matrix_rows(scenarios: &[Value], expected: &[(&str, &str, &str)]) -> Result<()> { - for (scenario_id, status, outcome) in expected { - let row = support::find_by_field(scenarios, "/scenario_id", scenario_id)?; - - assert_eq!(row.pointer("/status").and_then(Value::as_str), Some(*status)); - assert_eq!(row.pointer("/comparison_outcome").and_then(Value::as_str), Some(*outcome)); - assert!( - row.pointer("/evidence") - .and_then(Value::as_str) - .is_some_and(|evidence| !evidence.trim().is_empty()) - ); - } - - Ok(()) -} +mod adapter_assertions; +mod generated_manifest; +mod representative; -#[test] -fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { - let manifest = serde_json::json!({ - "schema": "elf.real_world_external_adapter_manifest/v1", - "manifest_id": "graphify-generated-manifest-test", - "docker_isolation": { - "default": true, - "compose_file": "docker-compose.baseline.yml", - "runner": "scripts/graphify-docker-graph-report-smoke.py", - "artifact_dir": "tmp/real-world-memory/graphify-smoke", - "host_global_installs_required": false, - "notes": ["Synthetic graphify generated-manifest regression test."] - }, - "adapters": [{ - "adapter_id": "graphify_docker_smoke", - "project": "graphify", - "adapter_kind": "docker_cli_graph_report_smoke", - "evidence_class": "live_real_world", - "docker_default": true, - "host_global_installs_required": false, - "overall_status": "wrong_result", - "setup": { - "status": "pass", - "evidence": "setup evidence", - "command": "cargo make smoke-graphify-docker-graph-report", - "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" - }, - "run": { - "status": "pass", - "evidence": "run evidence", - "command": "cargo make smoke-graphify-docker-graph-report", - "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" - }, - "result": { - "status": "wrong_result", - "evidence": "result evidence", - "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" - }, - "capabilities": [{ - "capability": "quality_or_scale_claim", - "status": "not_encoded", - "evidence": "No broad graph quality claim." - }], - "suites": [ - { - "suite_id": "knowledge_compilation", - "status": "wrong_result", - "evidence": "Only the generated graph/report evidence-mapping job is represented." - }, - { - "suite_id": "retrieval", - "status": "blocked", - "evidence": "The smoke uses graphify query output only to support source mapping; broad retrieval quality is not scored." - } - ], - "evidence": [], - "execution_metadata": { - "setup_path": "cargo make smoke-graphify-docker-graph-report", - "runtime_boundary": "Docker-only generated graph/report smoke.", - "resource_expectation": "Tiny generated corpus only.", - "retry_guidance": [], - "sources": [{ - "label": "graphify", - "url": "https://github.com/safishamsi/graphify", - "evidence": "Synthetic generated-manifest regression source." - }], - "research_depth": "Generated smoke manifest path" - }, - "notes": ["tiny smoke non-pass"] - }] - }); - let temp_dir = - env::temp_dir().join(format!("elf-real-world-graphify-manifest-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let manifest_path = temp_dir.join("manifest.json"); - let report_path = temp_dir.join("report.json"); - - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::fixture_dir()) - .arg("--out") - .arg(&report_path) - .arg("--external-adapter-manifest") - .arg(&manifest_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job runner failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let report: Value = serde_json::from_slice(&fs::read(&report_path)?)?; - let adapters = support::array_at(&report, "/external_adapters/adapters")?; - let graphify = support::find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; - let suites = support::array_at(graphify, "/suites")?; - let retrieval = support::find_by_field(suites, "/suite_id", "retrieval")?; - - assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert!( - retrieval - .pointer("/evidence") - .and_then(Value::as_str) - .is_some_and(|text| { text.contains("broad retrieval quality is not scored") }) - ); - - Ok(()) -} - -#[test] -fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<()> { - let report = support::run_json_report_from(support::graph_rag_external_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); - assert_eq!( - report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), - Some(0.667) - ); - assert_eq!( - report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), - Some(0.0) - ); - assert_eq!( - report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(1) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let ragflow = - support::find_by_field(jobs, "/job_id", "graph-rag-ragflow-reference-chunks-001")?; - let lightrag = - support::find_by_field(jobs, "/job_id", "graph-rag-lightrag-context-sources-001")?; - let graphrag = support::find_by_field(jobs, "/job_id", "graph-rag-graphrag-output-tables-001")?; - let graphiti = - support::find_by_field(jobs, "/job_id", "graph-rag-graphiti-temporal-validity-001")?; - let graphify = support::find_by_field(jobs, "/job_id", "graph-rag-graphify-graph-report-001")?; - - assert_eq!(ragflow.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(lightrag.pointer("/status").and_then(Value::as_str), Some("incomplete")); - assert_eq!(graphrag.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(graphiti.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(graphify.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - graphify.pointer("/knowledge/stale_claim_detection").and_then(Value::as_f64), - Some(0.0) - ); - assert_eq!( - graphify.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - graphiti.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - graphiti.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("graphiti.provider_boundary") - ); - assert!(support::array_contains_str( - graphiti, - "/produced_evidence", - "graphiti-current-fact-contract" - )?); - assert!(support::array_contains_str( - graphiti, - "/produced_evidence", - "graphiti-historical-fact-contract" - )?); - assert!(support::array_contains_str( - graphiti, - "/produced_evidence", - "graphiti-provider-boundary" - )?); - assert!(support::array_contains_str( - graphify, - "/produced_evidence", - "graphify-source-location-output" - )?); - - Ok(()) -} +pub(super) use adapter_assertions::{ + assert_graph_rag_representative_scenarios, assert_graphify_adapter, +}; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/adapter_assertions.rs b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/adapter_assertions.rs new file mode 100644 index 00000000..7a103808 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/adapter_assertions.rs @@ -0,0 +1,159 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(in super::super) fn assert_graphify_adapter(adapter: &Value) -> Result<()> { + assert_eq!(adapter.pointer("/evidence_class").and_then(Value::as_str), Some("live_real_world")); + assert_eq!(adapter.pointer("/overall_status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(adapter.pointer("/setup/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/run/status").and_then(Value::as_str), Some("pass")); + assert_eq!(adapter.pointer("/result/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + adapter.pointer("/setup/command").and_then(Value::as_str), + Some("cargo make smoke-graphify-docker-graph-report") + ); + assert_eq!( + adapter.pointer("/suites/0/suite_id").and_then(Value::as_str), + Some("knowledge_compilation") + ); + assert_eq!(adapter.pointer("/suites/0/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!(adapter.pointer("/suites/1/suite_id").and_then(Value::as_str), Some("retrieval")); + assert_eq!(adapter.pointer("/suites/1/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + adapter.pointer("/execution_metadata/research_depth").and_then(Value::as_str), + Some( + "D1 feasibility verdict plus XY-889 Docker graph/report smoke implementation and XY-900 scored smoke promotion; current Docker validation reaches graphify output and scores the tiny knowledge_compilation job as wrong_result" + ) + ); + + let capabilities = support::array_at(adapter, "/capabilities")?; + let quality = support::find_by_field(capabilities, "/capability", "quality_or_scale_claim")?; + + assert_eq!(quality.pointer("/status").and_then(Value::as_str), Some("not_encoded")); + assert!(support::array_at(adapter, "/notes")?.iter().any(|note| { + note.as_str().is_some_and(|text| text.contains("tiny smoke") && text.contains("non-pass")) + })); + + Ok(()) +} + +pub(in super::super) fn assert_graph_rag_representative_scenarios( + ragflow: &Value, + lightrag: &Value, + graphrag: &Value, + graphiti_zep: &Value, + graphify: &Value, +) -> Result<()> { + let ragflow_scenarios = support::array_at(ragflow, "/scenarios")?; + let lightrag_scenarios = support::array_at(lightrag, "/scenarios")?; + let graphrag_scenarios = support::array_at(graphrag, "/scenarios")?; + let graphiti_scenarios = support::array_at(graphiti_zep, "/scenarios")?; + let graphify_scenarios = support::array_at(graphify, "/scenarios")?; + let ragflow_chunk = support::find_by_field( + ragflow_scenarios, + "/scenario_id", + "reference_chunk_citation_mapping", + )?; + let lightrag_context = support::find_by_field( + lightrag_scenarios, + "/scenario_id", + "context_source_reference_mapping", + )?; + let graphrag_tables = support::find_by_field( + graphrag_scenarios, + "/scenario_id", + "output_table_citation_mapping", + )?; + let graphiti_temporal = support::find_by_field( + graphiti_scenarios, + "/scenario_id", + "temporal_validity_window_mapping", + )?; + let graphify_lint = + support::find_by_field(graphify_scenarios, "/scenario_id", "graph_report_navigation_lint")?; + + assert_eq!( + ragflow_chunk.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(lightrag_context.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!( + lightrag_context.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!( + graphrag_tables.pointer("/artifact").and_then(Value::as_str), + Some( + "apps/elf-eval/fixtures/real_world_external_adapters/graph_rag/graphrag_output_tables_blocked.json" + ) + ); + assert_eq!( + graphiti_temporal.pointer("/comparison_outcome").and_then(Value::as_str), + Some("blocked") + ); + assert_eq!(graphify_lint.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + graphify_lint.pointer("/comparison_outcome").and_then(Value::as_str), + Some("not_tested") + ); + assert!( + graphify_lint + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| evidence.contains("not an ELF victory claim")) + ); + + assert_adapter_matrix_rows( + ragflow_scenarios, + &[ + ("reference_chunk_citation_mapping", "blocked", "blocked"), + ("retrieval_quality_reference_recall", "blocked", "blocked"), + ("navigation_quality_document_chunks", "blocked", "blocked"), + ("answer_faithfulness_reference_chunks", "blocked", "blocked"), + ("stale_source_behavior", "not_encoded", "not_tested"), + ("knowledge_compilation_quality", "not_encoded", "not_tested"), + ], + )?; + assert_adapter_matrix_rows( + lightrag_scenarios, + &[ + ("context_source_reference_mapping", "incomplete", "blocked"), + ("retrieval_quality_context_recall", "incomplete", "blocked"), + ("citation_quality_context_references", "incomplete", "blocked"), + ("navigation_quality_graph_context", "incomplete", "blocked"), + ("answer_faithfulness_context_refs", "incomplete", "blocked"), + ("stale_source_behavior", "not_encoded", "not_tested"), + ("knowledge_compilation_quality", "not_encoded", "not_tested"), + ], + )?; + assert_adapter_matrix_rows( + graphrag_scenarios, + &[ + ("output_table_citation_mapping", "blocked", "blocked"), + ("retrieval_quality_local_search", "not_encoded", "not_tested"), + ("navigation_quality_community_graph", "blocked", "blocked"), + ("answer_faithfulness_output_tables", "blocked", "blocked"), + ("stale_source_behavior", "not_encoded", "not_tested"), + ("graph_summary_synthesis_quality", "not_encoded", "not_tested"), + ], + )?; + + Ok(()) +} + +fn assert_adapter_matrix_rows(scenarios: &[Value], expected: &[(&str, &str, &str)]) -> Result<()> { + for (scenario_id, status, outcome) in expected { + let row = support::find_by_field(scenarios, "/scenario_id", scenario_id)?; + + assert_eq!(row.pointer("/status").and_then(Value::as_str), Some(*status)); + assert_eq!(row.pointer("/comparison_outcome").and_then(Value::as_str), Some(*outcome)); + assert!( + row.pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|evidence| !evidence.trim().is_empty()) + ); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/generated_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/generated_manifest.rs new file mode 100644 index 00000000..21d0e83e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/generated_manifest.rs @@ -0,0 +1,123 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn graphify_generated_manifest_keeps_retrieval_unscored() -> Result<()> { + let manifest = serde_json::json!({ + "schema": "elf.real_world_external_adapter_manifest/v1", + "manifest_id": "graphify-generated-manifest-test", + "docker_isolation": { + "default": true, + "compose_file": "docker-compose.baseline.yml", + "runner": "scripts/graphify-docker-graph-report-smoke.py", + "artifact_dir": "tmp/real-world-memory/graphify-smoke", + "host_global_installs_required": false, + "notes": ["Synthetic graphify generated-manifest regression test."] + }, + "adapters": [{ + "adapter_id": "graphify_docker_smoke", + "project": "graphify", + "adapter_kind": "docker_cli_graph_report_smoke", + "evidence_class": "live_real_world", + "docker_default": true, + "host_global_installs_required": false, + "overall_status": "wrong_result", + "setup": { + "status": "pass", + "evidence": "setup evidence", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-smoke.json" + }, + "run": { + "status": "pass", + "evidence": "run evidence", + "command": "cargo make smoke-graphify-docker-graph-report", + "artifact": "tmp/real-world-memory/graphify-smoke/summary.json" + }, + "result": { + "status": "wrong_result", + "evidence": "result evidence", + "artifact": "tmp/real-world-memory/graphify-smoke/graphify-report.json" + }, + "capabilities": [{ + "capability": "quality_or_scale_claim", + "status": "not_encoded", + "evidence": "No broad graph quality claim." + }], + "suites": [ + { + "suite_id": "knowledge_compilation", + "status": "wrong_result", + "evidence": "Only the generated graph/report evidence-mapping job is represented." + }, + { + "suite_id": "retrieval", + "status": "blocked", + "evidence": "The smoke uses graphify query output only to support source mapping; broad retrieval quality is not scored." + } + ], + "evidence": [], + "execution_metadata": { + "setup_path": "cargo make smoke-graphify-docker-graph-report", + "runtime_boundary": "Docker-only generated graph/report smoke.", + "resource_expectation": "Tiny generated corpus only.", + "retry_guidance": [], + "sources": [{ + "label": "graphify", + "url": "https://github.com/safishamsi/graphify", + "evidence": "Synthetic generated-manifest regression source." + }], + "research_depth": "Generated smoke manifest path" + }, + "notes": ["tiny smoke non-pass"] + }] + }); + let temp_dir = + env::temp_dir().join(format!("elf-real-world-graphify-manifest-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let manifest_path = temp_dir.join("manifest.json"); + let report_path = temp_dir.join("report.json"); + + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::fixture_dir()) + .arg("--out") + .arg(&report_path) + .arg("--external-adapter-manifest") + .arg(&manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let report: Value = serde_json::from_slice(&fs::read(&report_path)?)?; + let adapters = support::array_at(&report, "/external_adapters/adapters")?; + let graphify = support::find_by_field(adapters, "/adapter_id", "graphify_docker_smoke")?; + let suites = support::array_at(graphify, "/suites")?; + let retrieval = support::find_by_field(suites, "/suite_id", "retrieval")?; + + assert_eq!(retrieval.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert!( + retrieval + .pointer("/evidence") + .and_then(Value::as_str) + .is_some_and(|text| { text.contains("broad retrieval quality is not scored") }) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/representative.rs b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/representative.rs new file mode 100644 index 00000000..3207888e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/graph_rag/representative.rs @@ -0,0 +1,89 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn graph_rag_representative_fixtures_report_typed_non_pass_states() -> Result<()> { + let report = support::run_json_report_from(support::graph_rag_external_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(1)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); + assert_eq!( + report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.667) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/temporal_validity_not_encoded_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(1) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let ragflow = + support::find_by_field(jobs, "/job_id", "graph-rag-ragflow-reference-chunks-001")?; + let lightrag = + support::find_by_field(jobs, "/job_id", "graph-rag-lightrag-context-sources-001")?; + let graphrag = support::find_by_field(jobs, "/job_id", "graph-rag-graphrag-output-tables-001")?; + let graphiti = + support::find_by_field(jobs, "/job_id", "graph-rag-graphiti-temporal-validity-001")?; + let graphify = support::find_by_field(jobs, "/job_id", "graph-rag-graphify-graph-report-001")?; + + assert_eq!(ragflow.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(lightrag.pointer("/status").and_then(Value::as_str), Some("incomplete")); + assert_eq!(graphrag.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphiti.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(graphify.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + graphify.pointer("/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(0.0) + ); + assert_eq!( + graphify.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + graphiti.pointer("/evolution/temporal_validity_not_encoded").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + graphiti.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("graphiti.provider_boundary") + ); + assert!(support::array_contains_str( + graphiti, + "/produced_evidence", + "graphiti-current-fact-contract" + )?); + assert!(support::array_contains_str( + graphiti, + "/produced_evidence", + "graphiti-historical-fact-contract" + )?); + assert!(support::array_contains_str( + graphiti, + "/produced_evidence", + "graphiti-provider-boundary" + )?); + assert!(support::array_contains_str( + graphify, + "/produced_evidence", + "graphify-source-location-output" + )?); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/header_shape.rs b/apps/elf-eval/tests/real_world_job_benchmark/header_shape.rs new file mode 100644 index 00000000..fb0ecefe --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/header_shape.rs @@ -0,0 +1,76 @@ +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_dreaming_readiness_ledger_header(ledger: &Value) -> Result<()> { + assert_eq!( + ledger.pointer("/schema").and_then(Value::as_str), + Some("elf.dreaming_readiness_stage_ledger/v1") + ); + assert_eq!(ledger.pointer("/authority").and_then(Value::as_str), Some("XY-951")); + + for term in ["improved", "regressed", "unchanged", "blocked", "not_tested"] { + assert!(support::array_contains_str(ledger, "/judgment_terms", term)?); + } + for term in ["pass", "wrong_result", "blocked", "not_tested", "not_encoded"] { + assert!(support::array_contains_str(ledger, "/count_fields", term)?); + } + + Ok(()) +} + +pub(super) fn assert_dreaming_readiness_stage_shape( + ledger: &Value, + stages: &[Value], +) -> Result<()> { + assert_eq!(stages.len(), 8); + + for stage_id in [ + "current_vs_historical_correctness", + "preference_evolution", + "deletion_ttl_tombstone_behavior", + "reviewable_consolidation", + "memory_summary_top_of_mind_behavior", + "proactive_brief_readiness", + "scheduled_memory_task_readiness", + "final_competitor_retest_status", + ] { + support::find_by_field(stages, "/stage_id", stage_id)?; + } + for stage in stages { + let stage_id = + stage.pointer("/stage_id").and_then(Value::as_str).unwrap_or(""); + + assert!( + !support::array_at(stage, "/baseline_commands")?.is_empty(), + "{stage_id} missing baseline commands" + ); + assert!( + !support::array_at(stage, "/post_stage_commands")?.is_empty(), + "{stage_id} missing post-stage commands" + ); + assert!( + !support::array_at(stage, "/evidence_files")?.is_empty(), + "{stage_id} missing evidence files" + ); + + for count_field in support::string_array_at(ledger, "/count_fields")? { + let pointer = format!("/baseline_counts/{count_field}"); + + assert!( + stage.pointer(&pointer).and_then(Value::as_u64).is_some(), + "{stage_id} missing {pointer}" + ); + } + + let judgment = stage + .pointer("/comparison_judgment") + .and_then(Value::as_str) + .ok_or_else(|| eyre::eyre!("{stage_id} missing comparison_judgment"))?; + + assert!(support::array_contains_str(ledger, "/judgment_terms", judgment)?); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown.rs new file mode 100644 index 00000000..c9520929 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown.rs @@ -0,0 +1,28 @@ +pub(super) fn assert_dreaming_readiness_markdown_boundaries(markdown: &str) { + assert!( + markdown.contains("`improved`: current-vs-historical correctness, preference evolution") + && markdown.contains("reviewable") + && markdown.contains("proactive brief") + ); + assert!(markdown.contains("memory-summary/top-of-mind fixture readback")); + assert!(markdown.contains("XY-953 adds a direct `proactive_brief` suite")); + assert!(markdown.contains("XY-954 adds a direct `scheduled_memory` suite")); + assert!(markdown.contains( + "Do not claim fixture-backed proactive brief scoring proves OpenAI Pulse parity" + )); + assert!( + markdown + .contains("Do not claim fixture-backed scheduled-memory scoring proves ChatGPT Tasks") + ); + assert!(markdown.contains("`regressed`: none")); + assert!(markdown.contains("the XY-905 run passes all six memory-evolution jobs")); + assert!(markdown.contains("XY-952 adds a reviewable `elf.memory_summary/v1`")); + assert!(markdown.contains("XY-955 closes the final competitor retest row")); + assert!(markdown.contains("XY-905")); + assert!(markdown.contains("qmd live `pass=17`, `wrong_result=13`")); + assert!( + markdown + .contains("Do not claim this ledger proves preference history against mem0/OpenMemory") + ); + assert!(markdown.contains("Reviewable consolidation now has ELF live service-backed")); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers.rs index eaba97a7..4547890a 100644 --- a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers.rs +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers.rs @@ -1,350 +1,14 @@ -use std::{ - collections::HashMap, - sync::{Arc, atomic::AtomicUsize}, +mod context; +mod inserts; +mod qdrant; +mod rerank; + +pub(super) use self::{ + context::{TestContext, reset_collection, setup_context}, + inserts::{ + insert_chunk, insert_note, insert_note_with_importance, + insert_note_with_importance_and_source_ref, insert_summary_field_row, + }, + qdrant::upsert_point, + rerank::{KeywordRerank, build_providers}, }; - -use qdrant_client::{ - Payload, - qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, -}; -use serde_json::Value; -use sqlx::PgExecutor; -use time::OffsetDateTime; -use uuid::Uuid; - -use crate::acceptance::{self, SpyExtractor, StubEmbedding}; -use elf_config::ProviderConfig; -use elf_service::{BoxFuture, ElfService, Providers, RerankProvider, Result}; -use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; -use elf_testkit::TestDatabase; - -pub(super) struct TestContext { - pub(super) service: ElfService, - pub(super) test_db: TestDatabase, - pub(super) embedding_version: String, -} - -pub(super) struct KeywordRerank { - pub(super) keyword: &'static str, -} -impl RerankProvider for KeywordRerank { - fn rerank<'a>( - &'a self, - _cfg: &'a ProviderConfig, - _query: &'a str, - docs: &'a [String], - ) -> BoxFuture<'a, Result>> { - let keyword = self.keyword; - - Box::pin(async move { - Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) - }) - } -} - -pub(super) fn build_providers(rerank: R) -> Providers -where - R: RerankProvider + Send + Sync + 'static, -{ - Providers::new( - Arc::new(StubEmbedding { vector_dim: 4_096 }), - Arc::new(rerank), - Arc::new(SpyExtractor { - calls: Arc::new(AtomicUsize::new(0)), - payload: serde_json::json!({ "notes": [] }), - }), - ) -} - -pub(super) fn build_payload( - note_id: Uuid, - chunk_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, -) -> Payload { - let mut payload = Payload::new(); - - payload.insert("note_id", note_id.to_string()); - payload.insert("chunk_id", chunk_id.to_string()); - payload.insert("chunk_index", Value::from(chunk_index)); - payload.insert("start_offset", Value::from(start_offset)); - payload.insert("end_offset", Value::from(end_offset)); - payload.insert("tenant_id", "t"); - payload.insert("project_id", "p"); - payload.insert("agent_id", "a"); - payload.insert("scope", "agent_private"); - payload.insert("status", "active"); - - payload -} - -pub(super) fn build_vectors(text: &str) -> HashMap { - let mut vectors = HashMap::new(); - - vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec![0.0_f32; 4_096])); - vectors.insert( - BM25_VECTOR_NAME.to_string(), - Vector::from(Document::new(text.to_string(), BM25_MODEL)), - ); - - vectors -} - -pub(super) async fn setup_context(test_name: &str, providers: Providers) -> Option { - let Some(test_db) = acceptance::test_db().await else { - eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); - - return None; - }; - let Some(qdrant_url) = acceptance::test_qdrant_url() else { - eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); - - return None; - }; - let collection = test_db.collection_name("elf_acceptance"); - let docs_collection = test_db.collection_name("elf_acceptance_docs"); - let cfg = acceptance::test_config( - test_db.dsn().to_string(), - qdrant_url, - 4_096, - collection, - docs_collection, - ); - let service = - acceptance::build_service(cfg, providers).await.expect("Failed to build service."); - - acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - - reset_collection(&service).await; - - let embedding_version = format!( - "{}:{}:{}", - service.cfg.providers.embedding.provider_id, - service.cfg.providers.embedding.model, - service.cfg.storage.qdrant.vector_dim - ); - - Some(TestContext { service, test_db, embedding_version }) -} - -pub(super) async fn reset_collection(service: &ElfService) { - acceptance::reset_qdrant_collection( - &service.qdrant.client, - &service.qdrant.collection, - service.qdrant.vector_dim, - ) - .await - .expect("Failed to reset Qdrant collection."); -} - -pub(super) async fn insert_note<'e, E>( - executor: E, - note_id: Uuid, - note_text: &str, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - insert_note_with_importance_and_source_ref( - executor, - note_id, - note_text, - embedding_version, - 0.4_f32, - 0.9_f32, - "agent_private", - serde_json::json!({}), - ) - .await; -} - -pub(super) async fn insert_note_with_importance<'e, E>( - executor: E, - note_id: Uuid, - note_text: &str, - embedding_version: &str, - importance: f32, - confidence: f32, - scope: &str, -) where - E: PgExecutor<'e>, -{ - insert_note_with_importance_and_source_ref( - executor, - note_id, - note_text, - embedding_version, - importance, - confidence, - scope, - serde_json::json!({}), - ) - .await; -} - -#[allow(clippy::too_many_arguments)] -pub(super) async fn insert_note_with_importance_and_source_ref<'e, E>( - executor: E, - note_id: Uuid, - note_text: &str, - embedding_version: &str, - importance: f32, - confidence: f32, - scope: &str, - source_ref: Value, -) where - E: PgExecutor<'e>, -{ - let now = OffsetDateTime::now_utc(); - - sqlx::query( - "\ -INSERT INTO memory_notes ( - note_id, - tenant_id, - project_id, - agent_id, - scope, - type, - key, - text, - importance, - confidence, - status, - created_at, - updated_at, - expires_at, - embedding_version, - source_ref, - hit_count, - last_hit_at -) -VALUES ( - $1, - $2, - $3, - $4, - $5, - $6, - $7, - $8, - $9, - $10, - $11, - $12, - $13, - $14, - $15, - $16, - $17, - $18 -)", - ) - .bind(note_id) - .bind("t") - .bind("p") - .bind("a") - .bind(scope) - .bind("fact") - .bind(Option::::None) - .bind(note_text) - .bind(importance) - .bind(confidence) - .bind("active") - .bind(now) - .bind(now) - .bind(Option::::None) - .bind(embedding_version) - .bind(source_ref) - .bind(0_i64) - .bind(Option::::None) - .execute(executor) - .await - .expect("Failed to insert memory note."); -} - -#[allow(clippy::too_many_arguments)] -pub(super) async fn insert_summary_field_row<'e, E>( - executor: E, - field_id: Uuid, - note_id: Uuid, - summary: &str, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) -VALUES ($1, $2, $3, $4, $5)", - ) - .bind(field_id) - .bind(note_id) - .bind("summary") - .bind(0_i32) - .bind(summary) - .execute(executor) - .await - .expect("Failed to insert note summary field."); -} - -#[allow(clippy::too_many_arguments)] -pub(super) async fn insert_chunk<'e, E>( - executor: E, - chunk_id: Uuid, - note_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, - text: &str, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO memory_note_chunks ( - chunk_id, - note_id, - chunk_index, - start_offset, - end_offset, - text, - embedding_version -) -VALUES ($1, $2, $3, $4, $5, $6, $7)", - ) - .bind(chunk_id) - .bind(note_id) - .bind(chunk_index) - .bind(start_offset) - .bind(end_offset) - .bind(text) - .bind(embedding_version) - .execute(executor) - .await - .expect("Failed to insert chunk metadata."); -} - -pub(super) async fn upsert_point( - service: &ElfService, - chunk_id: Uuid, - note_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, - text: &str, -) { - let payload = build_payload(note_id, chunk_id, chunk_index, start_offset, end_offset); - let vectors = build_vectors(text); - let point = PointStruct::new(chunk_id.to_string(), vectors, payload); - - service - .qdrant - .client - .upsert_points( - UpsertPointsBuilder::new(service.qdrant.collection.clone(), vec![point]).wait(true), - ) - .await - .expect("Failed to upsert Qdrant point."); -} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/context.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/context.rs new file mode 100644 index 00000000..29cf4af3 --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/context.rs @@ -0,0 +1,59 @@ +use crate::acceptance; +use elf_service::{ElfService, Providers}; +use elf_testkit::TestDatabase; + +pub(in super::super) struct TestContext { + pub(in super::super) service: ElfService, + pub(in super::super) test_db: TestDatabase, + pub(in super::super) embedding_version: String, +} + +pub(in super::super) async fn setup_context( + test_name: &str, + providers: Providers, +) -> Option { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + + reset_collection(&service).await; + + let embedding_version = format!( + "{}:{}:{}", + service.cfg.providers.embedding.provider_id, + service.cfg.providers.embedding.model, + service.cfg.storage.qdrant.vector_dim + ); + + Some(TestContext { service, test_db, embedding_version }) +} + +pub(in super::super) async fn reset_collection(service: &ElfService) { + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/inserts.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/inserts.rs new file mode 100644 index 00000000..51a6d15a --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/inserts.rs @@ -0,0 +1,192 @@ +use serde_json::Value; +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +pub(in super::super) async fn insert_note<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + insert_note_with_importance_and_source_ref( + executor, + note_id, + note_text, + embedding_version, + 0.4_f32, + 0.9_f32, + "agent_private", + serde_json::json!({}), + ) + .await; +} + +pub(in super::super) async fn insert_note_with_importance<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, + importance: f32, + confidence: f32, + scope: &str, +) where + E: PgExecutor<'e>, +{ + insert_note_with_importance_and_source_ref( + executor, + note_id, + note_text, + embedding_version, + importance, + confidence, + scope, + serde_json::json!({}), + ) + .await; +} + +#[allow(clippy::too_many_arguments)] +pub(in super::super) async fn insert_note_with_importance_and_source_ref<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, + importance: f32, + confidence: f32, + scope: &str, + source_ref: Value, +) where + E: PgExecutor<'e>, +{ + let now = OffsetDateTime::now_utc(); + + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note_id) + .bind("t") + .bind("p") + .bind("a") + .bind(scope) + .bind("fact") + .bind(Option::::None) + .bind(note_text) + .bind(importance) + .bind(confidence) + .bind("active") + .bind(now) + .bind(now) + .bind(Option::::None) + .bind(embedding_version) + .bind(source_ref) + .bind(0_i64) + .bind(Option::::None) + .execute(executor) + .await + .expect("Failed to insert memory note."); +} + +#[allow(clippy::too_many_arguments)] +pub(in super::super) async fn insert_summary_field_row<'e, E>( + executor: E, + field_id: Uuid, + note_id: Uuid, + summary: &str, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) +VALUES ($1, $2, $3, $4, $5)", + ) + .bind(field_id) + .bind(note_id) + .bind("summary") + .bind(0_i32) + .bind(summary) + .execute(executor) + .await + .expect("Failed to insert note summary field."); +} + +#[allow(clippy::too_many_arguments)] +pub(in super::super) async fn insert_chunk<'e, E>( + executor: E, + chunk_id: Uuid, + note_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + text: &str, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(chunk_id) + .bind(note_id) + .bind(chunk_index) + .bind(start_offset) + .bind(end_offset) + .bind(text) + .bind(embedding_version) + .execute(executor) + .await + .expect("Failed to insert chunk metadata."); +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/qdrant.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/qdrant.rs new file mode 100644 index 00000000..a6663d8d --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/qdrant.rs @@ -0,0 +1,70 @@ +use std::collections::HashMap; + +use qdrant_client::{ + Payload, + qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, +}; +use serde_json::Value; +use uuid::Uuid; + +use elf_service::ElfService; +use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; + +#[allow(clippy::too_many_arguments)] +pub(in super::super) async fn upsert_point( + service: &ElfService, + chunk_id: Uuid, + note_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + text: &str, +) { + let payload = build_payload(note_id, chunk_id, chunk_index, start_offset, end_offset); + let vectors = build_vectors(text); + let point = PointStruct::new(chunk_id.to_string(), vectors, payload); + + service + .qdrant + .client + .upsert_points( + UpsertPointsBuilder::new(service.qdrant.collection.clone(), vec![point]).wait(true), + ) + .await + .expect("Failed to upsert Qdrant point."); +} + +fn build_payload( + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, +) -> Payload { + let mut payload = Payload::new(); + + payload.insert("note_id", note_id.to_string()); + payload.insert("chunk_id", chunk_id.to_string()); + payload.insert("chunk_index", Value::from(chunk_index)); + payload.insert("start_offset", Value::from(start_offset)); + payload.insert("end_offset", Value::from(end_offset)); + payload.insert("tenant_id", "t"); + payload.insert("project_id", "p"); + payload.insert("agent_id", "a"); + payload.insert("scope", "agent_private"); + payload.insert("status", "active"); + + payload +} + +fn build_vectors(text: &str) -> HashMap { + let mut vectors = HashMap::new(); + + vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(vec![0.0_f32; 4_096])); + vectors.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(text.to_string(), BM25_MODEL)), + ); + + vectors +} diff --git a/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/rerank.rs b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/rerank.rs new file mode 100644 index 00000000..792053ee --- /dev/null +++ b/packages/elf-service/tests/acceptance/chunk_search/tests_helpers/rerank.rs @@ -0,0 +1,37 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use crate::acceptance::{SpyExtractor, StubEmbedding}; +use elf_config::ProviderConfig; +use elf_service::{BoxFuture, Providers, RerankProvider, Result}; + +pub(in super::super) struct KeywordRerank { + pub(in super::super) keyword: &'static str, +} +impl RerankProvider for KeywordRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + _query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, Result>> { + let keyword = self.keyword; + + Box::pin(async move { + Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) + }) + } +} + +pub(in super::super) fn build_providers(rerank: R) -> Providers +where + R: RerankProvider + Send + Sync + 'static, +{ + Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(rerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ) +} diff --git a/packages/elf-service/tests/acceptance/structured_field_retrieval/support.rs b/packages/elf-service/tests/acceptance/structured_field_retrieval/support.rs index 3e28282e..2570ff47 100644 --- a/packages/elf-service/tests/acceptance/structured_field_retrieval/support.rs +++ b/packages/elf-service/tests/acceptance/structured_field_retrieval/support.rs @@ -1,360 +1,12 @@ -use std::{ - collections::HashMap, - sync::{Arc, atomic::AtomicUsize}, +mod context; +mod inserts; +mod qdrant; + +pub(crate) use self::{ + context::{TestContext, setup_context}, + inserts::{ + insert_chunk, insert_chunk_embedding, insert_fact_field_embedding, insert_fact_field_row, + insert_note, + }, + qdrant::{UpsertPointArgs, upsert_point}, }; - -use qdrant_client::{ - Payload, - qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, -}; -use serde_json::Value; -use sqlx::PgExecutor; -use time::OffsetDateTime; -use uuid::Uuid; - -use crate::acceptance::{self, SpyExtractor, StubEmbedding}; -use elf_config::ProviderConfig; -use elf_service::{BoxFuture, ElfService, Providers, RerankProvider, Result}; -use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; -use elf_testkit::TestDatabase; - -pub(crate) struct TestContext { - pub(crate) service: ElfService, - pub(crate) test_db: TestDatabase, - pub(crate) embedding_version: String, -} - -pub(crate) struct UpsertPointArgs<'a> { - pub(crate) chunk_id: Uuid, - pub(crate) note_id: Uuid, - pub(crate) chunk_index: i32, - pub(crate) start_offset: i32, - pub(crate) end_offset: i32, - pub(crate) text: &'a str, - pub(crate) dense: Vec, -} - -struct KeywordRerank { - keyword: &'static str, -} -impl RerankProvider for KeywordRerank { - fn rerank<'a>( - &'a self, - _cfg: &'a ProviderConfig, - _query: &'a str, - docs: &'a [String], - ) -> BoxFuture<'a, Result>> { - let keyword = self.keyword; - - Box::pin(async move { - Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) - }) - } -} - -pub(crate) async fn setup_context(test_name: &str) -> Option { - let Some(test_db) = acceptance::test_db().await else { - eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); - - return None; - }; - let Some(qdrant_url) = acceptance::test_qdrant_url() else { - eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); - - return None; - }; - let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 4_096 }), - Arc::new(KeywordRerank { keyword: "ZEBRA" }), - Arc::new(SpyExtractor { - calls: Arc::new(AtomicUsize::new(0)), - payload: serde_json::json!({ "notes": [] }), - }), - ); - let collection = test_db.collection_name("elf_acceptance"); - let docs_collection = test_db.collection_name("elf_acceptance_docs"); - let cfg = acceptance::test_config( - test_db.dsn().to_string(), - qdrant_url, - 4_096, - collection, - docs_collection, - ); - let service = - acceptance::build_service(cfg, providers).await.expect("Failed to build service."); - - acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); - acceptance::reset_qdrant_collection( - &service.qdrant.client, - &service.qdrant.collection, - service.qdrant.vector_dim, - ) - .await - .expect("Failed to reset Qdrant collection."); - - let embedding_version = format!( - "{}:{}:{}", - service.cfg.providers.embedding.provider_id, - service.cfg.providers.embedding.model, - service.cfg.storage.qdrant.vector_dim - ); - - Some(TestContext { service, test_db, embedding_version }) -} - -pub(crate) async fn insert_note<'e, E>( - executor: E, - note_id: Uuid, - note_text: &str, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - let now = OffsetDateTime::now_utc(); - - sqlx::query( - "\ -INSERT INTO memory_notes ( - note_id, - tenant_id, - project_id, - agent_id, - scope, - type, - key, - text, - importance, - confidence, - status, - created_at, - updated_at, - expires_at, - embedding_version, - source_ref, - hit_count, - last_hit_at -) -VALUES ( - $1, - $2, - $3, - $4, - $5, - $6, - $7, - $8, - $9, - $10, - $11, - $12, - $13, - $14, - $15, - $16, - $17, - $18 -)", - ) - .bind(note_id) - .bind("t") - .bind("p") - .bind("a") - .bind("agent_private") - .bind("fact") - .bind(Option::::None) - .bind(note_text) - .bind(0.4_f32) - .bind(0.9_f32) - .bind("active") - .bind(now) - .bind(now) - .bind(Option::::None) - .bind(embedding_version) - .bind(serde_json::json!({})) - .bind(0_i64) - .bind(Option::::None) - .execute(executor) - .await - .expect("Failed to insert memory note."); -} - -#[allow(clippy::too_many_arguments)] -pub(crate) async fn insert_chunk<'e, E>( - executor: E, - chunk_id: Uuid, - note_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, - text: &str, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO memory_note_chunks ( - chunk_id, - note_id, - chunk_index, - start_offset, - end_offset, - text, - embedding_version -) -VALUES ($1, $2, $3, $4, $5, $6, $7)", - ) - .bind(chunk_id) - .bind(note_id) - .bind(chunk_index) - .bind(start_offset) - .bind(end_offset) - .bind(text) - .bind(embedding_version) - .execute(executor) - .await - .expect("Failed to insert chunk metadata."); -} - -pub(crate) async fn insert_chunk_embedding<'e, E>( - executor: E, - chunk_id: Uuid, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - let vec_text = vec_text_zeros(); - - sqlx::query( - "\ -INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) -VALUES ($1, $2, $3, $4::text::vector)", - ) - .bind(chunk_id) - .bind(embedding_version) - .bind(4_096_i32) - .bind(vec_text.as_str()) - .execute(executor) - .await - .expect("Failed to insert chunk embedding."); -} - -pub(crate) async fn insert_fact_field_row<'e, E>( - executor: E, - field_id: Uuid, - note_id: Uuid, - fact_text: &str, -) where - E: PgExecutor<'e>, -{ - sqlx::query( - "\ -INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) -VALUES ($1, $2, $3, $4, $5)", - ) - .bind(field_id) - .bind(note_id) - .bind("fact") - .bind(0_i32) - .bind(fact_text) - .execute(executor) - .await - .expect("Failed to insert note field."); -} - -pub(crate) async fn insert_fact_field_embedding<'e, E>( - executor: E, - field_id: Uuid, - embedding_version: &str, -) where - E: PgExecutor<'e>, -{ - let vec_text = vec_text_zeros(); - - sqlx::query( - "\ -INSERT INTO note_field_embeddings (field_id, embedding_version, embedding_dim, vec) -VALUES ($1, $2, $3, $4::text::vector)", - ) - .bind(field_id) - .bind(embedding_version) - .bind(4_096_i32) - .bind(vec_text.as_str()) - .execute(executor) - .await - .expect("Failed to insert field embedding."); -} - -pub(crate) async fn upsert_point(service: &ElfService, args: UpsertPointArgs<'_>) { - let payload = build_payload( - args.note_id, - args.chunk_id, - args.chunk_index, - args.start_offset, - args.end_offset, - ); - let vectors = build_vectors(args.text, args.dense); - let point = PointStruct::new(args.chunk_id.to_string(), vectors, payload); - - service - .qdrant - .client - .upsert_points( - UpsertPointsBuilder::new(service.qdrant.collection.clone(), vec![point]).wait(true), - ) - .await - .expect("Failed to upsert Qdrant point."); -} - -fn vec_text_zeros() -> String { - let mut buf = String::with_capacity(2 + (4_096 * 2)); - - buf.push('['); - - for i in 0..4_096 { - if i > 0 { - buf.push(','); - } - - buf.push('0'); - } - - buf.push(']'); - - buf -} - -fn build_payload( - note_id: Uuid, - chunk_id: Uuid, - chunk_index: i32, - start_offset: i32, - end_offset: i32, -) -> Payload { - let mut payload = Payload::new(); - - payload.insert("note_id", note_id.to_string()); - payload.insert("chunk_id", chunk_id.to_string()); - payload.insert("chunk_index", Value::from(chunk_index)); - payload.insert("start_offset", Value::from(start_offset)); - payload.insert("end_offset", Value::from(end_offset)); - payload.insert("tenant_id", "t"); - payload.insert("project_id", "p"); - payload.insert("agent_id", "a"); - payload.insert("scope", "agent_private"); - payload.insert("status", "active"); - - payload -} - -fn build_vectors(text: &str, dense: Vec) -> HashMap { - let mut vectors = HashMap::new(); - - vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(dense)); - vectors.insert( - BM25_VECTOR_NAME.to_string(), - Vector::from(Document::new(text.to_string(), BM25_MODEL)), - ); - - vectors -} diff --git a/packages/elf-service/tests/acceptance/structured_field_retrieval/support/context.rs b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/context.rs new file mode 100644 index 00000000..e3c20807 --- /dev/null +++ b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/context.rs @@ -0,0 +1,80 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding}; +use elf_config::ProviderConfig; +use elf_service::{BoxFuture, ElfService, Providers, RerankProvider, Result}; +use elf_testkit::TestDatabase; + +pub(crate) struct TestContext { + pub(crate) service: ElfService, + pub(crate) test_db: TestDatabase, + pub(crate) embedding_version: String, +} + +struct KeywordRerank { + keyword: &'static str, +} +impl RerankProvider for KeywordRerank { + fn rerank<'a>( + &'a self, + _cfg: &'a ProviderConfig, + _query: &'a str, + docs: &'a [String], + ) -> BoxFuture<'a, Result>> { + let keyword = self.keyword; + + Box::pin(async move { + Ok(docs.iter().map(|doc| if doc.contains(keyword) { 1.0 } else { 0.1 }).collect()) + }) + } +} + +pub(crate) async fn setup_context(test_name: &str) -> Option { + let Some(test_db) = acceptance::test_db().await else { + eprintln!("Skipping {test_name}; set ELF_PG_DSN to run this test."); + + return None; + }; + let Some(qdrant_url) = acceptance::test_qdrant_url() else { + eprintln!("Skipping {test_name}; set ELF_QDRANT_URL to run this test."); + + return None; + }; + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(KeywordRerank { keyword: "ZEBRA" }), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({ "notes": [] }), + }), + ); + let collection = test_db.collection_name("elf_acceptance"); + let docs_collection = test_db.collection_name("elf_acceptance_docs"); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + qdrant_url, + 4_096, + collection, + docs_collection, + ); + let service = + acceptance::build_service(cfg, providers).await.expect("Failed to build service."); + + acceptance::reset_db(&service.db.pool).await.expect("Failed to reset test database."); + acceptance::reset_qdrant_collection( + &service.qdrant.client, + &service.qdrant.collection, + service.qdrant.vector_dim, + ) + .await + .expect("Failed to reset Qdrant collection."); + + let embedding_version = format!( + "{}:{}:{}", + service.cfg.providers.embedding.provider_id, + service.cfg.providers.embedding.model, + service.cfg.storage.qdrant.vector_dim + ); + + Some(TestContext { service, test_db, embedding_version }) +} diff --git a/packages/elf-service/tests/acceptance/structured_field_retrieval/support/inserts.rs b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/inserts.rs new file mode 100644 index 00000000..120960cb --- /dev/null +++ b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/inserts.rs @@ -0,0 +1,204 @@ +use sqlx::PgExecutor; +use time::OffsetDateTime; +use uuid::Uuid; + +pub(crate) async fn insert_note<'e, E>( + executor: E, + note_id: Uuid, + note_text: &str, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + let now = OffsetDateTime::now_utc(); + + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref, + hit_count, + last_hit_at +) +VALUES ( + $1, + $2, + $3, + $4, + $5, + $6, + $7, + $8, + $9, + $10, + $11, + $12, + $13, + $14, + $15, + $16, + $17, + $18 +)", + ) + .bind(note_id) + .bind("t") + .bind("p") + .bind("a") + .bind("agent_private") + .bind("fact") + .bind(Option::::None) + .bind(note_text) + .bind(0.4_f32) + .bind(0.9_f32) + .bind("active") + .bind(now) + .bind(now) + .bind(Option::::None) + .bind(embedding_version) + .bind(serde_json::json!({})) + .bind(0_i64) + .bind(Option::::None) + .execute(executor) + .await + .expect("Failed to insert memory note."); +} + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn insert_chunk<'e, E>( + executor: E, + chunk_id: Uuid, + note_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, + text: &str, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_chunks ( + chunk_id, + note_id, + chunk_index, + start_offset, + end_offset, + text, + embedding_version +) +VALUES ($1, $2, $3, $4, $5, $6, $7)", + ) + .bind(chunk_id) + .bind(note_id) + .bind(chunk_index) + .bind(start_offset) + .bind(end_offset) + .bind(text) + .bind(embedding_version) + .execute(executor) + .await + .expect("Failed to insert chunk metadata."); +} + +pub(crate) async fn insert_chunk_embedding<'e, E>( + executor: E, + chunk_id: Uuid, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + let vec_text = vec_text_zeros(); + + sqlx::query( + "\ +INSERT INTO note_chunk_embeddings (chunk_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(chunk_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text.as_str()) + .execute(executor) + .await + .expect("Failed to insert chunk embedding."); +} + +pub(crate) async fn insert_fact_field_row<'e, E>( + executor: E, + field_id: Uuid, + note_id: Uuid, + fact_text: &str, +) where + E: PgExecutor<'e>, +{ + sqlx::query( + "\ +INSERT INTO memory_note_fields (field_id, note_id, field_kind, item_index, text) +VALUES ($1, $2, $3, $4, $5)", + ) + .bind(field_id) + .bind(note_id) + .bind("fact") + .bind(0_i32) + .bind(fact_text) + .execute(executor) + .await + .expect("Failed to insert note field."); +} + +pub(crate) async fn insert_fact_field_embedding<'e, E>( + executor: E, + field_id: Uuid, + embedding_version: &str, +) where + E: PgExecutor<'e>, +{ + let vec_text = vec_text_zeros(); + + sqlx::query( + "\ +INSERT INTO note_field_embeddings (field_id, embedding_version, embedding_dim, vec) +VALUES ($1, $2, $3, $4::text::vector)", + ) + .bind(field_id) + .bind(embedding_version) + .bind(4_096_i32) + .bind(vec_text.as_str()) + .execute(executor) + .await + .expect("Failed to insert field embedding."); +} + +fn vec_text_zeros() -> String { + let mut buf = String::with_capacity(2 + (4_096 * 2)); + + buf.push('['); + + for i in 0..4_096 { + if i > 0 { + buf.push(','); + } + + buf.push('0'); + } + + buf.push(']'); + + buf +} diff --git a/packages/elf-service/tests/acceptance/structured_field_retrieval/support/qdrant.rs b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/qdrant.rs new file mode 100644 index 00000000..2db2c5bd --- /dev/null +++ b/packages/elf-service/tests/acceptance/structured_field_retrieval/support/qdrant.rs @@ -0,0 +1,77 @@ +use std::collections::HashMap; + +use qdrant_client::{ + Payload, + qdrant::{Document, PointStruct, UpsertPointsBuilder, Vector}, +}; +use serde_json::Value; +use uuid::Uuid; + +use elf_service::ElfService; +use elf_storage::qdrant::{BM25_MODEL, BM25_VECTOR_NAME, DENSE_VECTOR_NAME}; + +pub(crate) struct UpsertPointArgs<'a> { + pub(crate) chunk_id: Uuid, + pub(crate) note_id: Uuid, + pub(crate) chunk_index: i32, + pub(crate) start_offset: i32, + pub(crate) end_offset: i32, + pub(crate) text: &'a str, + pub(crate) dense: Vec, +} + +pub(crate) async fn upsert_point(service: &ElfService, args: UpsertPointArgs<'_>) { + let payload = build_payload( + args.note_id, + args.chunk_id, + args.chunk_index, + args.start_offset, + args.end_offset, + ); + let vectors = build_vectors(args.text, args.dense); + let point = PointStruct::new(args.chunk_id.to_string(), vectors, payload); + + service + .qdrant + .client + .upsert_points( + UpsertPointsBuilder::new(service.qdrant.collection.clone(), vec![point]).wait(true), + ) + .await + .expect("Failed to upsert Qdrant point."); +} + +fn build_payload( + note_id: Uuid, + chunk_id: Uuid, + chunk_index: i32, + start_offset: i32, + end_offset: i32, +) -> Payload { + let mut payload = Payload::new(); + + payload.insert("note_id", note_id.to_string()); + payload.insert("chunk_id", chunk_id.to_string()); + payload.insert("chunk_index", Value::from(chunk_index)); + payload.insert("start_offset", Value::from(start_offset)); + payload.insert("end_offset", Value::from(end_offset)); + payload.insert("tenant_id", "t"); + payload.insert("project_id", "p"); + payload.insert("agent_id", "a"); + payload.insert("scope", "agent_private"); + payload.insert("status", "active"); + + payload +} + +fn build_vectors(text: &str, dense: Vec) -> HashMap { + let mut vectors = HashMap::new(); + + vectors.insert(DENSE_VECTOR_NAME.to_string(), Vector::from(dense)); + vectors.insert( + BM25_VECTOR_NAME.to_string(), + Vector::from(Document::new(text.to_string(), BM25_MODEL)), + ); + + vectors +} diff --git a/packages/elf-service/tests/acceptance/work_journal.rs b/packages/elf-service/tests/acceptance/work_journal.rs index 16d39c61..6f0ab5ec 100644 --- a/packages/elf-service/tests/acceptance/work_journal.rs +++ b/packages/elf-service/tests/acceptance/work_journal.rs @@ -1,379 +1,3 @@ -use std::sync::{Arc, atomic::AtomicUsize}; - -use serde_json::Value; -use time::OffsetDateTime; -use uuid::Uuid; - -use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; -use elf_domain::writegate::{WritePolicy, WriteRedaction, WriteSpan}; -use elf_service::{ - ElfService, Error, Providers, WorkJournalEntryCreateRequest, WorkJournalEntryFamily, - WorkJournalEntryGetRequest, WorkJournalSessionReadbackRequest, -}; -use elf_storage::{db::Db, qdrant::QdrantStore}; -use elf_testkit::TestDatabase; - -fn work_journal_entry_request(entry_id: Uuid) -> WorkJournalEntryCreateRequest { - WorkJournalEntryCreateRequest { - tenant_id: "tenant".to_string(), - project_id: "project".to_string(), - agent_id: "agent-a".to_string(), - entry_id: Some(entry_id), - scope: "agent_private".to_string(), - session_id: "xy-1117-session".to_string(), - family: WorkJournalEntryFamily::SessionLog, - title: Some("XY-1117 session log".to_string()), - body: "Work stopped after the dry run failed with api_key=abcdef123.".to_string(), - source_refs: vec![serde_json::json!({ - "schema": "source_ref/v1", - "resolver": "work_journal_test/v1", - "ref": { - "issue": "XY-1117", - "session_id": "xy-1117-session" - } - })], - write_policy: Some(WritePolicy { - exclusions: vec![], - redactions: vec![WriteRedaction::Replace { - span: WriteSpan { start: 43, end: 60 }, - replacement: "[redacted credential]".to_string(), - }], - }), - explicit_next_steps: vec!["Run the Work Journal validation tests.".to_string()], - inferred_next_steps: vec![ - "Keep journal evidence separate from current memory answers.".to_string(), - ], - rejected_options: vec![ - "Do not store this session log as an authoritative memory note.".to_string(), - ], - promotion_boundary: serde_json::json!({ "authoritative_memory_allowed": true }), - } -} - -fn request_with_promotion_boundary( - entry_id: Uuid, - promotion_boundary: Value, -) -> WorkJournalEntryCreateRequest { - let mut request = work_journal_entry_request(entry_id); - - request.body = "Work stopped after accepted promotion evidence was reviewed.".to_string(); - request.write_policy = None; - request.promotion_boundary = promotion_boundary; - - request -} - -fn memory_record_ref(note_id: Uuid) -> Value { - serde_json::json!({ - "schema": "elf.memory_record_ref/v1", - "kind": "note", - "id": note_id, - "status": "active" - }) -} - -fn dreaming_review_ref(proposal_id: Uuid, review_state: &str) -> Value { - serde_json::json!({ - "schema": "elf.dreaming_review_queue/v1", - "proposal_id": proposal_id, - "review_state": review_state - }) -} - -async fn work_journal_service() -> Option<(ElfService, TestDatabase)> { - let Some(dsn) = elf_testkit::env_dsn() else { - eprintln!("Skipping work_journal acceptance; set ELF_PG_DSN to run this test."); - - return None; - }; - let test_db = TestDatabase::new(&dsn).await.expect("Failed to create test database."); - let cfg = acceptance::test_config( - test_db.dsn().to_string(), - "http://127.0.0.1:1".to_string(), - 4_096, - test_db.collection_name("elf_acceptance_notes"), - test_db.collection_name("elf_acceptance_docs"), - ); - let db = Db::connect(&cfg.storage.postgres).await.expect("Failed to connect test DB."); - - db.ensure_schema(cfg.storage.qdrant.vector_dim).await.expect("Failed to ensure schema"); - - let qdrant = QdrantStore::new(&cfg.storage.qdrant).expect("Failed to build qdrant store"); - let providers = Providers::new( - Arc::new(StubEmbedding { vector_dim: 4_096 }), - Arc::new(StubRerank), - Arc::new(SpyExtractor { - calls: Arc::new(AtomicUsize::new(0)), - payload: serde_json::json!({}), - }), - ); - let service = ElfService::with_providers(cfg, db, qdrant, providers); - - Some((service, test_db)) -} - -#[tokio::test] -async fn work_journal_persists_redacted_source_adjacent_session_readback() { - let Some((service, test_db)) = work_journal_service().await else { - return; - }; - let entry_id = Uuid::parse_str("aaaaaaaa-1111-4111-8111-aaaaaaaa1111").expect("uuid"); - let created = service - .work_journal_entry_create(work_journal_entry_request(entry_id)) - .await - .expect("journal entry should persist"); - - assert_eq!(created.entry.entry_id, entry_id); - assert!(created.entry.body.contains("[redacted credential]")); - assert!(!created.entry.body.contains("abcdef123")); - assert_eq!( - created.entry.promotion_boundary["authoritative_memory_allowed"], - serde_json::json!(false) - ); - - let fetched = service - .work_journal_entry_get(WorkJournalEntryGetRequest { - tenant_id: "tenant".to_string(), - project_id: "project".to_string(), - agent_id: "agent-a".to_string(), - read_profile: "private_only".to_string(), - entry_id, - }) - .await - .expect("journal entry should be readable"); - - assert_eq!(fetched.entry_id, entry_id); - assert_eq!(fetched.source_refs.len(), 1); - - let readback = service - .work_journal_session_readback(WorkJournalSessionReadbackRequest { - tenant_id: "tenant".to_string(), - project_id: "project".to_string(), - agent_id: "agent-a".to_string(), - read_profile: "private_only".to_string(), - session_id: "xy-1117-session".to_string(), - families: vec![], - limit: Some(10), - }) - .await - .expect("session readback should load journal evidence"); - - assert_eq!(readback.items.len(), 1); - - let where_stopped = readback.where_stopped.expect("where_stopped should be present"); - - assert_eq!(where_stopped.latest_entry_id, entry_id); - assert_eq!( - where_stopped.explicit_next_steps, - vec!["Run the Work Journal validation tests.".to_string()] - ); - assert_eq!( - where_stopped.promotion_boundary["authoritative_memory_allowed"], - serde_json::json!(false) - ); - - let memory_count: i64 = sqlx::query_scalar("SELECT count(*) FROM memory_notes") - .fetch_one(&service.db.pool) - .await - .expect("memory_notes count should query"); - let outbox_count: i64 = sqlx::query_scalar("SELECT count(*) FROM indexing_outbox") - .fetch_one(&service.db.pool) - .await - .expect("indexing_outbox count should query"); - - assert_eq!(memory_count, 0, "Work Journal must not create authoritative memory notes"); - assert_eq!(outbox_count, 0, "Work Journal must not enqueue memory indexing"); - - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -async fn work_journal_promotion_boundary_requires_existing_accepted_refs() { - let Some((service, test_db)) = work_journal_service().await else { - return; - }; - let forged_note_id = Uuid::parse_str("bbbbbbbb-1111-4111-8111-bbbbbbbb1111").expect("uuid"); - let forged_note_request = request_with_promotion_boundary( - Uuid::parse_str("bbbbbbbb-2222-4222-8222-bbbbbbbb2222").expect("uuid"), - serde_json::json!({ - "accepted_memory_authority_ref": memory_record_ref(forged_note_id), - }), - ); - let forged_note_error = service - .work_journal_entry_create(forged_note_request) - .await - .expect_err("syntactically valid but nonexistent memory authority ref should be rejected"); - - assert!(matches!( - forged_note_error, - Error::InvalidRequest { message } if message.contains("accepted_memory_authority_ref") - )); - - let accepted_note_id = Uuid::parse_str("cccccccc-1111-4111-8111-cccccccc1111").expect("uuid"); - - insert_active_memory_note(&service, accepted_note_id).await; - - let accepted_note_request = request_with_promotion_boundary( - Uuid::parse_str("cccccccc-2222-4222-8222-cccccccc2222").expect("uuid"), - serde_json::json!({ - "accepted_memory_authority_ref": memory_record_ref(accepted_note_id), - }), - ); - let accepted_note = service - .work_journal_entry_create(accepted_note_request) - .await - .expect("existing active memory authority ref should be accepted"); - - assert_eq!( - accepted_note.entry.promotion_boundary["authoritative_memory_allowed"], - serde_json::json!(true) - ); - - let forged_proposal_id = Uuid::parse_str("dddddddd-1111-4111-8111-dddddddd1111").expect("uuid"); - let forged_proposal_request = request_with_promotion_boundary( - Uuid::parse_str("dddddddd-2222-4222-8222-dddddddd2222").expect("uuid"), - serde_json::json!({ - "accepted_dreaming_review_ref": dreaming_review_ref(forged_proposal_id, "applied"), - }), - ); - let forged_proposal_error = service - .work_journal_entry_create(forged_proposal_request) - .await - .expect_err("syntactically valid but nonexistent dreaming review ref should be rejected"); - - assert!(matches!( - forged_proposal_error, - Error::InvalidRequest { message } if message.contains("accepted_dreaming_review_ref") - )); - - let accepted_proposal_id = - Uuid::parse_str("eeeeeeee-1111-4111-8111-eeeeeeee1111").expect("uuid"); - - insert_applied_dreaming_proposal(&service, accepted_proposal_id).await; - - let accepted_proposal_request = request_with_promotion_boundary( - Uuid::parse_str("eeeeeeee-2222-4222-8222-eeeeeeee2222").expect("uuid"), - serde_json::json!({ - "accepted_dreaming_review_ref": dreaming_review_ref(accepted_proposal_id, "applied"), - }), - ); - let accepted_proposal = service - .work_journal_entry_create(accepted_proposal_request) - .await - .expect("existing applied dreaming review ref should be accepted"); - - assert_eq!( - accepted_proposal.entry.promotion_boundary["authoritative_memory_allowed"], - serde_json::json!(true) - ); - - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -async fn insert_active_memory_note(service: &ElfService, note_id: Uuid) { - let now = OffsetDateTime::now_utc(); - - sqlx::query( - "\ -INSERT INTO memory_notes ( - note_id, - tenant_id, - project_id, - agent_id, - scope, - type, - key, - text, - importance, - confidence, - status, - created_at, - updated_at, - expires_at, - embedding_version, - source_ref -) -VALUES ($1,'tenant','project','agent-a','agent_private','fact','accepted-memory-ref','Fact: The accepted memory note is active and readable.',0.8,0.9,'active',$2,$2,NULL,'test:embedding:4096',$3)", - ) - .bind(note_id) - .bind(now) - .bind(serde_json::json!({ "schema": "work_journal_test/v1", "kind": "accepted_memory" })) - .execute(&service.db.pool) - .await - .expect("accepted memory note should insert"); -} - -async fn insert_applied_dreaming_proposal(service: &ElfService, proposal_id: Uuid) { - let run_id = Uuid::parse_str("eeeeeeee-3333-4333-8333-eeeeeeee3333").expect("uuid"); - let now = OffsetDateTime::now_utc(); - - sqlx::query( - "\ -INSERT INTO consolidation_runs ( - run_id, - tenant_id, - project_id, - agent_id, - contract_schema, - job_kind, - status, - input_refs, - source_snapshot, - lineage, - error, - created_at, - updated_at, - completed_at -) -VALUES ($1,'tenant','project','agent-a','elf.consolidation/v1','manual','completed',$2,$3,$4,'{}'::jsonb,$5,$5,$5)", - ) - .bind(run_id) - .bind(serde_json::json!([])) - .bind(serde_json::json!({ "source_count": 0 })) - .bind(serde_json::json!({ "source": "work_journal_test" })) - .bind(now) - .execute(&service.db.pool) - .await - .expect("consolidation run should insert"); - sqlx::query( - "\ -INSERT INTO consolidation_proposals ( - proposal_id, - run_id, - tenant_id, - project_id, - agent_id, - contract_schema, - proposal_kind, - apply_intent, - review_state, - source_refs, - source_snapshot, - lineage, - diff, - confidence, - unsupported_claim_flags, - contradiction_markers, - staleness_markers, - target_ref, - proposed_payload, - reviewer_agent_id, - review_comment, - reviewed_at, - created_at, - updated_at -) -VALUES ($1,$2,'tenant','project','agent-a','elf.consolidation/v1','memory_summary','no_op','applied',$3,$4,$5,$6,0.9,'[]'::jsonb,'[]'::jsonb,'[]'::jsonb,'{}'::jsonb,$7,'agent-a','Apply reviewed Work Journal test proposal.',$8,$8,$8)", - ) - .bind(proposal_id) - .bind(run_id) - .bind(serde_json::json!([])) - .bind(serde_json::json!({ "source_count": 0 })) - .bind(serde_json::json!({ "source": "work_journal_test" })) - .bind(serde_json::json!({ "summary": "Applied proposal supports Work Journal authority." })) - .bind(serde_json::json!({ "schema": "elf.dreaming_review_queue/v1" })) - .bind(now) - .execute(&service.db.pool) - .await - .expect("consolidation proposal should insert"); -} +mod helpers; +mod promotion_boundary; +mod session_readback; diff --git a/packages/elf-service/tests/acceptance/work_journal/helpers.rs b/packages/elf-service/tests/acceptance/work_journal/helpers.rs new file mode 100644 index 00000000..a612cde2 --- /dev/null +++ b/packages/elf-service/tests/acceptance/work_journal/helpers.rs @@ -0,0 +1,218 @@ +use std::sync::{Arc, atomic::AtomicUsize}; + +use serde_json::Value; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::acceptance::{self, SpyExtractor, StubEmbedding, StubRerank}; +use elf_domain::writegate::{WritePolicy, WriteRedaction, WriteSpan}; +use elf_service::{ElfService, Providers, WorkJournalEntryCreateRequest, WorkJournalEntryFamily}; +use elf_storage::{db::Db, qdrant::QdrantStore}; +use elf_testkit::TestDatabase; + +pub(super) fn work_journal_entry_request(entry_id: Uuid) -> WorkJournalEntryCreateRequest { + WorkJournalEntryCreateRequest { + tenant_id: "tenant".to_string(), + project_id: "project".to_string(), + agent_id: "agent-a".to_string(), + entry_id: Some(entry_id), + scope: "agent_private".to_string(), + session_id: "xy-1117-session".to_string(), + family: WorkJournalEntryFamily::SessionLog, + title: Some("XY-1117 session log".to_string()), + body: "Work stopped after the dry run failed with api_key=abcdef123.".to_string(), + source_refs: vec![serde_json::json!({ + "schema": "source_ref/v1", + "resolver": "work_journal_test/v1", + "ref": { + "issue": "XY-1117", + "session_id": "xy-1117-session" + } + })], + write_policy: Some(WritePolicy { + exclusions: vec![], + redactions: vec![WriteRedaction::Replace { + span: WriteSpan { start: 43, end: 60 }, + replacement: "[redacted credential]".to_string(), + }], + }), + explicit_next_steps: vec!["Run the Work Journal validation tests.".to_string()], + inferred_next_steps: vec![ + "Keep journal evidence separate from current memory answers.".to_string(), + ], + rejected_options: vec![ + "Do not store this session log as an authoritative memory note.".to_string(), + ], + promotion_boundary: serde_json::json!({ "authoritative_memory_allowed": true }), + } +} + +pub(super) fn request_with_promotion_boundary( + entry_id: Uuid, + promotion_boundary: Value, +) -> WorkJournalEntryCreateRequest { + let mut request = work_journal_entry_request(entry_id); + + request.body = "Work stopped after accepted promotion evidence was reviewed.".to_string(); + request.write_policy = None; + request.promotion_boundary = promotion_boundary; + + request +} + +pub(super) fn memory_record_ref(note_id: Uuid) -> Value { + serde_json::json!({ + "schema": "elf.memory_record_ref/v1", + "kind": "note", + "id": note_id, + "status": "active" + }) +} + +pub(super) fn dreaming_review_ref(proposal_id: Uuid, review_state: &str) -> Value { + serde_json::json!({ + "schema": "elf.dreaming_review_queue/v1", + "proposal_id": proposal_id, + "review_state": review_state + }) +} + +pub(super) async fn work_journal_service() -> Option<(ElfService, TestDatabase)> { + let Some(dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping work_journal acceptance; set ELF_PG_DSN to run this test."); + + return None; + }; + let test_db = TestDatabase::new(&dsn).await.expect("Failed to create test database."); + let cfg = acceptance::test_config( + test_db.dsn().to_string(), + "http://127.0.0.1:1".to_string(), + 4_096, + test_db.collection_name("elf_acceptance_notes"), + test_db.collection_name("elf_acceptance_docs"), + ); + let db = Db::connect(&cfg.storage.postgres).await.expect("Failed to connect test DB."); + + db.ensure_schema(cfg.storage.qdrant.vector_dim).await.expect("Failed to ensure schema"); + + let qdrant = QdrantStore::new(&cfg.storage.qdrant).expect("Failed to build qdrant store"); + let providers = Providers::new( + Arc::new(StubEmbedding { vector_dim: 4_096 }), + Arc::new(StubRerank), + Arc::new(SpyExtractor { + calls: Arc::new(AtomicUsize::new(0)), + payload: serde_json::json!({}), + }), + ); + let service = ElfService::with_providers(cfg, db, qdrant, providers); + + Some((service, test_db)) +} + +pub(super) async fn insert_active_memory_note(service: &ElfService, note_id: Uuid) { + let now = OffsetDateTime::now_utc(); + + sqlx::query( + "\ +INSERT INTO memory_notes ( + note_id, + tenant_id, + project_id, + agent_id, + scope, + type, + key, + text, + importance, + confidence, + status, + created_at, + updated_at, + expires_at, + embedding_version, + source_ref +) +VALUES ($1,'tenant','project','agent-a','agent_private','fact','accepted-memory-ref','Fact: The accepted memory note is active and readable.',0.8,0.9,'active',$2,$2,NULL,'test:embedding:4096',$3)", + ) + .bind(note_id) + .bind(now) + .bind(serde_json::json!({ "schema": "work_journal_test/v1", "kind": "accepted_memory" })) + .execute(&service.db.pool) + .await + .expect("accepted memory note should insert"); +} + +pub(super) async fn insert_applied_dreaming_proposal(service: &ElfService, proposal_id: Uuid) { + let run_id = Uuid::parse_str("eeeeeeee-3333-4333-8333-eeeeeeee3333").expect("uuid"); + let now = OffsetDateTime::now_utc(); + + sqlx::query( + "\ +INSERT INTO consolidation_runs ( + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + job_kind, + status, + input_refs, + source_snapshot, + lineage, + error, + created_at, + updated_at, + completed_at +) +VALUES ($1,'tenant','project','agent-a','elf.consolidation/v1','manual','completed',$2,$3,$4,'{}'::jsonb,$5,$5,$5)", + ) + .bind(run_id) + .bind(serde_json::json!([])) + .bind(serde_json::json!({ "source_count": 0 })) + .bind(serde_json::json!({ "source": "work_journal_test" })) + .bind(now) + .execute(&service.db.pool) + .await + .expect("consolidation run should insert"); + sqlx::query( + "\ +INSERT INTO consolidation_proposals ( + proposal_id, + run_id, + tenant_id, + project_id, + agent_id, + contract_schema, + proposal_kind, + apply_intent, + review_state, + source_refs, + source_snapshot, + lineage, + diff, + confidence, + unsupported_claim_flags, + contradiction_markers, + staleness_markers, + target_ref, + proposed_payload, + reviewer_agent_id, + review_comment, + reviewed_at, + created_at, + updated_at +) +VALUES ($1,$2,'tenant','project','agent-a','elf.consolidation/v1','memory_summary','no_op','applied',$3,$4,$5,$6,0.9,'[]'::jsonb,'[]'::jsonb,'[]'::jsonb,'{}'::jsonb,$7,'agent-a','Apply reviewed Work Journal test proposal.',$8,$8,$8)", + ) + .bind(proposal_id) + .bind(run_id) + .bind(serde_json::json!([])) + .bind(serde_json::json!({ "source_count": 0 })) + .bind(serde_json::json!({ "source": "work_journal_test" })) + .bind(serde_json::json!({ "summary": "Applied proposal supports Work Journal authority." })) + .bind(serde_json::json!({ "schema": "elf.dreaming_review_queue/v1" })) + .bind(now) + .execute(&service.db.pool) + .await + .expect("consolidation proposal should insert"); +} diff --git a/packages/elf-service/tests/acceptance/work_journal/promotion_boundary.rs b/packages/elf-service/tests/acceptance/work_journal/promotion_boundary.rs new file mode 100644 index 00000000..d51c303f --- /dev/null +++ b/packages/elf-service/tests/acceptance/work_journal/promotion_boundary.rs @@ -0,0 +1,93 @@ +use uuid::Uuid; + +use crate::acceptance::work_journal::helpers; +use elf_service::Error; + +#[tokio::test] +async fn work_journal_promotion_boundary_requires_existing_accepted_refs() { + let Some((service, test_db)) = helpers::work_journal_service().await else { + return; + }; + let forged_note_id = Uuid::parse_str("bbbbbbbb-1111-4111-8111-bbbbbbbb1111").expect("uuid"); + let forged_note_request = helpers::request_with_promotion_boundary( + Uuid::parse_str("bbbbbbbb-2222-4222-8222-bbbbbbbb2222").expect("uuid"), + serde_json::json!({ + "accepted_memory_authority_ref": helpers::memory_record_ref(forged_note_id), + }), + ); + let forged_note_error = service + .work_journal_entry_create(forged_note_request) + .await + .expect_err("syntactically valid but nonexistent memory authority ref should be rejected"); + + assert!(matches!( + forged_note_error, + Error::InvalidRequest { message } if message.contains("accepted_memory_authority_ref") + )); + + let accepted_note_id = Uuid::parse_str("cccccccc-1111-4111-8111-cccccccc1111").expect("uuid"); + + helpers::insert_active_memory_note(&service, accepted_note_id).await; + + let accepted_note_request = helpers::request_with_promotion_boundary( + Uuid::parse_str("cccccccc-2222-4222-8222-cccccccc2222").expect("uuid"), + serde_json::json!({ + "accepted_memory_authority_ref": helpers::memory_record_ref(accepted_note_id), + }), + ); + let accepted_note = service + .work_journal_entry_create(accepted_note_request) + .await + .expect("existing active memory authority ref should be accepted"); + + assert_eq!( + accepted_note.entry.promotion_boundary["authoritative_memory_allowed"], + serde_json::json!(true) + ); + + let forged_proposal_id = Uuid::parse_str("dddddddd-1111-4111-8111-dddddddd1111").expect("uuid"); + let forged_proposal_request = helpers::request_with_promotion_boundary( + Uuid::parse_str("dddddddd-2222-4222-8222-dddddddd2222").expect("uuid"), + serde_json::json!({ + "accepted_dreaming_review_ref": helpers::dreaming_review_ref( + forged_proposal_id, + "applied", + ), + }), + ); + let forged_proposal_error = service + .work_journal_entry_create(forged_proposal_request) + .await + .expect_err("syntactically valid but nonexistent dreaming review ref should be rejected"); + + assert!(matches!( + forged_proposal_error, + Error::InvalidRequest { message } if message.contains("accepted_dreaming_review_ref") + )); + + let accepted_proposal_id = + Uuid::parse_str("eeeeeeee-1111-4111-8111-eeeeeeee1111").expect("uuid"); + + helpers::insert_applied_dreaming_proposal(&service, accepted_proposal_id).await; + + let accepted_proposal_request = helpers::request_with_promotion_boundary( + Uuid::parse_str("eeeeeeee-2222-4222-8222-eeeeeeee2222").expect("uuid"), + serde_json::json!({ + "accepted_dreaming_review_ref": helpers::dreaming_review_ref( + accepted_proposal_id, + "applied", + ), + }), + ); + let accepted_proposal = service + .work_journal_entry_create(accepted_proposal_request) + .await + .expect("existing applied dreaming review ref should be accepted"); + + assert_eq!( + accepted_proposal.entry.promotion_boundary["authoritative_memory_allowed"], + serde_json::json!(true) + ); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-service/tests/acceptance/work_journal/session_readback.rs b/packages/elf-service/tests/acceptance/work_journal/session_readback.rs new file mode 100644 index 00000000..01db001c --- /dev/null +++ b/packages/elf-service/tests/acceptance/work_journal/session_readback.rs @@ -0,0 +1,79 @@ +use uuid::Uuid; + +use crate::acceptance::work_journal::helpers; +use elf_service::{WorkJournalEntryGetRequest, WorkJournalSessionReadbackRequest}; + +#[tokio::test] +async fn work_journal_persists_redacted_source_adjacent_session_readback() { + let Some((service, test_db)) = helpers::work_journal_service().await else { + return; + }; + let entry_id = Uuid::parse_str("aaaaaaaa-1111-4111-8111-aaaaaaaa1111").expect("uuid"); + let created = service + .work_journal_entry_create(helpers::work_journal_entry_request(entry_id)) + .await + .expect("journal entry should persist"); + + assert_eq!(created.entry.entry_id, entry_id); + assert!(created.entry.body.contains("[redacted credential]")); + assert!(!created.entry.body.contains("abcdef123")); + assert_eq!( + created.entry.promotion_boundary["authoritative_memory_allowed"], + serde_json::json!(false) + ); + + let fetched = service + .work_journal_entry_get(WorkJournalEntryGetRequest { + tenant_id: "tenant".to_string(), + project_id: "project".to_string(), + agent_id: "agent-a".to_string(), + read_profile: "private_only".to_string(), + entry_id, + }) + .await + .expect("journal entry should be readable"); + + assert_eq!(fetched.entry_id, entry_id); + assert_eq!(fetched.source_refs.len(), 1); + + let readback = service + .work_journal_session_readback(WorkJournalSessionReadbackRequest { + tenant_id: "tenant".to_string(), + project_id: "project".to_string(), + agent_id: "agent-a".to_string(), + read_profile: "private_only".to_string(), + session_id: "xy-1117-session".to_string(), + families: vec![], + limit: Some(10), + }) + .await + .expect("session readback should load journal evidence"); + + assert_eq!(readback.items.len(), 1); + + let where_stopped = readback.where_stopped.expect("where_stopped should be present"); + + assert_eq!(where_stopped.latest_entry_id, entry_id); + assert_eq!( + where_stopped.explicit_next_steps, + vec!["Run the Work Journal validation tests.".to_string()] + ); + assert_eq!( + where_stopped.promotion_boundary["authoritative_memory_allowed"], + serde_json::json!(false) + ); + + let memory_count: i64 = sqlx::query_scalar("SELECT count(*) FROM memory_notes") + .fetch_one(&service.db.pool) + .await + .expect("memory_notes count should query"); + let outbox_count: i64 = sqlx::query_scalar("SELECT count(*) FROM indexing_outbox") + .fetch_one(&service.db.pool) + .await + .expect("indexing_outbox count should query"); + + assert_eq!(memory_count, 0, "Work Journal must not create authoritative memory notes"); + assert_eq!(outbox_count, 0, "Work Journal must not enqueue memory indexing"); + + test_db.cleanup().await.expect("Failed to cleanup test database."); +}