diff --git a/.github/workflows/web-service.yml b/.github/workflows/web-service.yml index 88357201e..1791a1c03 100644 --- a/.github/workflows/web-service.yml +++ b/.github/workflows/web-service.yml @@ -38,7 +38,8 @@ env: DENY_WARNINGS: false runtime_name: ort runtime_version: v1.15.1 - web_api_archive: web-service-api + frontoffice_archive: web-service-frontoffice + backoffice_archive: web-service-backoffice jobs: services-build: @@ -74,12 +75,21 @@ jobs: ${{ inputs.model_name }} ${{ inputs.model_version }} \ ${{ env.runtime_name }} ${{ env.runtime_version }} - - name: Create web-api artifact + - name: Create backoffice artifact uses: ./.github/actions/release_artifact with: platform: ${{ inputs.platform }} - bin_name: "web-api" - archive_name: ${{ env.web_api_archive }} + bin_name: "ingestion" + archive_name: ${{ env.backoffice_archive }} + model_full_name: ${{ inputs.model_name }}_${{ inputs.model_version }} + runtime_full_name: ${{ env.runtime_name }}_${{ env.runtime_version }} + + - name: Create frontoffice artifact + uses: ./.github/actions/release_artifact + with: + platform: ${{ inputs.platform }} + bin_name: "personalization" + archive_name: ${{ env.frontoffice_archive }} model_full_name: ${{ inputs.model_name }}_${{ inputs.model_version }} runtime_full_name: ${{ env.runtime_name }}_${{ env.runtime_version }} @@ -110,20 +120,30 @@ jobs: TAG="$TAG-${{ inputs.model_name }}_${{ inputs.model_version }}" TAG="$TAG-${{ inputs.platform }}" fi - webapi_image_name="xaynetci/xayn_discovery_web_service:$TAG" - echo "webapi_image_name=$webapi_image_name" >> $GITHUB_ENV + frontoffice_image_name="xaynetci/xayn_discovery_web_service:$TAG" + backoffice_image_name="xaynetci/xayn_discovery_ingestion_service:$TAG" + echo "frontoffice_image_name=$frontoffice_image_name" >> $GITHUB_ENV + echo "backoffice_image_name=$backoffice_image_name" >> $GITHUB_ENV cat <>${GITHUB_STEP_SUMMARY} # Image names - WebApi: $webapi_image_name + Back-office: $frontoffice_image_name + Front-office: $backoffice_image_name EOT - - name: Create web API docker image + - name: Create backoffice docker image + uses: ./.github/actions/release_image + with: + archive_name: ${{ env.backoffice_archive }} + image_name: ${{ env.backoffice_image_name }} + platform: ${{ inputs.platform }} + + - name: Create frontoffice docker image uses: ./.github/actions/release_image with: - archive_name: ${{ env.web_api_archive }} - image_name: ${{ env.webapi_image_name }} + archive_name: ${{ env.frontoffice_archive }} + image_name: ${{ env.frontoffice_image_name }} platform: ${{ inputs.platform }} - name: Login to Docker Hub @@ -135,4 +155,5 @@ jobs: - name: Push images run: | set -eux - docker push ${{ env.webapi_image_name }} + docker push ${{ env.frontoffice_image_name }} + docker push ${{ env.backoffice_image_name }} diff --git a/integration-tests/src/lib.rs b/integration-tests/src/lib.rs index 3f09d327d..f7f09d6f2 100644 --- a/integration-tests/src/lib.rs +++ b/integration-tests/src/lib.rs @@ -62,7 +62,7 @@ use tracing_subscriber::{ Layer, }; use xayn_test_utils::{asset::ort_target, env::clear_env, workspace::find_workspace_dir}; -use xayn_web_api::{config, start, AppHandle, Application, WebApi}; +use xayn_web_api::{config, start, AppHandle, Application, Ingestion}; use xayn_web_api_db_ctrl::{Silo, Tenant}; use xayn_web_api_shared::{ elastic, @@ -723,7 +723,7 @@ pub fn build_test_config_from_parts_and_names( }, ); - if app_name == WebApi::NAME { + if app_name == Ingestion::NAME { let python_workspace = workspace.join("./snippet-extractor").display().to_string(); let tokenizer = model_dir.join("tokenizer.json").display().to_string(); let limit_to_two_threads = (num_cpus::get() as f32).recip() * 2.; diff --git a/integration-tests/tests/document_candidates.rs b/integration-tests/tests/document_candidates.rs index bf7ba581d..67b68614b 100644 --- a/integration-tests/tests/document_candidates.rs +++ b/integration-tests/tests/document_candidates.rs @@ -20,7 +20,7 @@ use reqwest::{Client, StatusCode, Url}; use serde::Deserialize; use serde_json::{json, Value}; use xayn_integration_tests::{send_assert, send_assert_json, test_app, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; async fn ingest(client: &Client, url: &Url) -> Result<(), Error> { send_assert( @@ -72,7 +72,7 @@ async fn set(client: &Client, url: &Url, ids: impl IntoIterator) -> #[test] fn test_candidates_all() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); ingest(&client, &url).await?; assert_eq!(get(&client, &url).await?.ids(), ["d1", "d2", "d3"].into()); @@ -84,7 +84,7 @@ fn test_candidates_all() { #[test] fn test_candidates_some() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); ingest(&client, &url).await?; assert_eq!(get(&client, &url).await?.ids(), ["d1", "d2", "d3"].into()); @@ -96,7 +96,7 @@ fn test_candidates_some() { #[test] fn test_candidates_none() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); ingest(&client, &url).await?; assert_eq!(get(&client, &url).await?.ids(), ["d1", "d2", "d3"].into()); @@ -108,7 +108,7 @@ fn test_candidates_none() { #[test] fn test_candidates_not_default() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); send_assert( &client, @@ -154,7 +154,7 @@ struct ServerError { #[test] fn test_candidates_warning() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); ingest(&client, &url).await?; assert_eq!(get(&client, &url).await?.ids(), ["d1", "d2", "d3"].into()); @@ -182,7 +182,7 @@ fn test_candidates_warning() { #[test] fn test_candidates_reingestion() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { assert!(get(&client, &url).await?.ids().is_empty()); send_assert( &client, @@ -254,7 +254,7 @@ fn test_deprecated_candidates() { Ok(()) } - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { deprecated_candidates(&client, &url, "/candidates").await?; deprecated_candidates(&client, &url, "/documents/candidates").await?; diff --git a/integration-tests/tests/document_properties.rs b/integration-tests/tests/document_properties.rs index f5caf840a..bf75ae1ae 100644 --- a/integration-tests/tests/document_properties.rs +++ b/integration-tests/tests/document_properties.rs @@ -18,7 +18,7 @@ use reqwest::StatusCode; use serde::Deserialize; use serde_json::{json, Value}; use xayn_integration_tests::{send_assert, send_assert_json, test_app, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; #[derive(Debug, Deserialize)] struct DocumentPropertiesResponse { @@ -33,7 +33,7 @@ enum Error { } fn document_properties(is_candidate: bool) { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { send_assert( &client, client @@ -151,7 +151,7 @@ struct DocumentPropertyResponse { } fn document_property(is_candidate: bool) { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { send_assert( &client, client diff --git a/integration-tests/tests/elastic.rs b/integration-tests/tests/elastic.rs index 91a0126ad..350743bf1 100644 --- a/integration-tests/tests/elastic.rs +++ b/integration-tests/tests/elastic.rs @@ -15,7 +15,7 @@ use serde_json::{json, Value}; use xayn_integration_tests::{test_app, TEST_EMBEDDING_SIZE, UNCHANGED_CONFIG}; use xayn_test_utils::assert_approx_eq; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; use xayn_web_api_shared::{ elastic::{BulkInstruction, Error, SerdeDiscard}, serde::json_object, @@ -32,7 +32,7 @@ fn emb(emb: &[f32]) -> Result { // just to be sure that the behavior hasn't changed #[test] fn test_normalized_es_knn_scores() { - test_app::(UNCHANGED_CONFIG, |_, _, services| async move { + test_app::(UNCHANGED_CONFIG, |_, _, services| async move { let client = services .silo .elastic_client() diff --git a/integration-tests/tests/et_4957_es_deletion.rs b/integration-tests/tests/et_4957_es_deletion.rs index 391debd5a..7c241fc71 100644 --- a/integration-tests/tests/et_4957_es_deletion.rs +++ b/integration-tests/tests/et_4957_es_deletion.rs @@ -51,7 +51,7 @@ use xayn_integration_tests::{ Services, UNCHANGED_CONFIG, }; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; use xayn_web_api_shared::json_object; async fn get_candidates(client: &Client, url: &Url) -> Result, Error> { @@ -216,7 +216,7 @@ fn string_set(x: impl IntoIterator>) -> HashSet #[test] fn test_deletes_them_from_elastic_search() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, services| async move { @@ -332,7 +332,7 @@ fn test_deletes_them_from_elastic_search() { #[test] fn test_deletes_them_from_elastic_search_2() { - test_app::( + test_app::( UNCHANGED_CONFIG, |client, ingestion_url, services| async move { ingest( diff --git a/integration-tests/tests/filter.rs b/integration-tests/tests/filter.rs index 521fe0601..bb932db6d 100644 --- a/integration-tests/tests/filter.rs +++ b/integration-tests/tests/filter.rs @@ -19,7 +19,7 @@ use reqwest::{Client, StatusCode, Url}; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use xayn_integration_tests::{send_assert, send_assert_json, test_two_apps, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; #[derive(Serialize)] struct IngestedDocument { @@ -93,7 +93,7 @@ struct BadRequest { #[test] fn test_filter_boolean() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -178,7 +178,7 @@ fn test_filter_boolean() { #[test] fn test_filter_keyword() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -263,7 +263,7 @@ fn test_filter_keyword() { #[test] fn test_filter_keyword_array_single() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -378,7 +378,7 @@ fn test_filter_keyword_array_single() { #[test] fn test_filter_keyword_array_multiple() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -509,7 +509,7 @@ fn test_filter_keyword_array_multiple() { #[test] fn test_filter_combine() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -677,7 +677,7 @@ fn test_filter_combine() { #[test] fn test_filter_number() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -783,7 +783,7 @@ fn test_filter_number() { #[test] fn test_filter_date() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -897,7 +897,7 @@ fn test_filter_date() { #[test] fn test_filter_no_value() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -1004,7 +1004,7 @@ fn test_filter_no_value() { #[test] fn test_deprecated_published_after() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { diff --git a/integration-tests/tests/health.rs b/integration-tests/tests/health.rs index a8f3510a7..266bd8b7b 100644 --- a/integration-tests/tests/health.rs +++ b/integration-tests/tests/health.rs @@ -16,11 +16,11 @@ use std::time::Duration; use reqwest::{Client, StatusCode}; use xayn_integration_tests::{send_assert, test_app, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; #[test] fn test_health() { - test_app::(UNCHANGED_CONFIG, |_client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |_client, url, _| async move { // make sure not to use any presets from `test_app`, like e.g. the // X-Xayn-Tenant-Id header. let client = Client::builder() diff --git a/integration-tests/tests/ingestion.rs b/integration-tests/tests/ingestion.rs index bc36d218e..ce905ec3f 100644 --- a/integration-tests/tests/ingestion.rs +++ b/integration-tests/tests/ingestion.rs @@ -27,7 +27,7 @@ use xayn_integration_tests::{ with_text_extractor_options, UNCHANGED_CONFIG, }; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; async fn ingest(client: &Client, url: &Url) -> Result<(), anyhow::Error> { send_assert( @@ -72,7 +72,7 @@ struct Error { #[test] fn test_ingestion_created() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { ingest(&client, &url).await?; send_assert( &client, @@ -103,7 +103,7 @@ fn test_ingestion_created() { #[test] fn test_ingestion_bad_request() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { let long_snippet = vec!["a"; 2049].join(""); let error = send_assert_json::( &client, @@ -217,7 +217,7 @@ fn test_ingestion_created_with_file() { .map(Into::into) .collect(); - test_app::( + test_app::( with_text_extractor_options(allowed_content_type, None), |client, url, _| async move { send_assert( @@ -270,7 +270,7 @@ fn test_ingestion_created_with_file_bad_request() { ); let txt_content = txt_content_data.clone(); - test_app::( + test_app::( with_text_extractor_options(vec![], None), |client, url, _| async move { send_assert( @@ -353,7 +353,7 @@ fn test_ingestion_created_with_file_bad_request() { ); let txt_content = txt_content_data; - test_app::( + test_app::( with_text_extractor_options(vec![], Some(1)), |client, url, _| async move { send_assert( @@ -378,7 +378,7 @@ fn test_ingestion_created_with_file_bad_request() { #[test] fn test_deletion() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { ingest(&client, &url).await?; send_assert( &client, @@ -418,7 +418,7 @@ struct SemanticSearchResponse { #[test] fn test_reingestion_candidates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -508,7 +508,7 @@ fn test_reingestion_candidates() { // new and changed documents have been logged and manually check the databases #[test] fn test_reingestion_snippets() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { send_assert( &client, client @@ -555,7 +555,7 @@ struct OrderPropertyResponse { #[test] fn test_ingestion_same_id() { - test_app::(UNCHANGED_CONFIG, |client, url, _| async move { + test_app::(UNCHANGED_CONFIG, |client, url, _| async move { send_assert( &client, client @@ -598,7 +598,7 @@ fn test_ingestion_same_id() { #[test] fn test_ingestion_validation() { - test_app::( + test_app::( Some(toml! { [ingestion] max_snippet_size = 10 diff --git a/integration-tests/tests/legacy_tenant_migration.rs b/integration-tests/tests/legacy_tenant_migration.rs index c78732900..eebb86373 100644 --- a/integration-tests/tests/legacy_tenant_migration.rs +++ b/integration-tests/tests/legacy_tenant_migration.rs @@ -35,7 +35,7 @@ use xayn_integration_tests::{ TEST_EMBEDDING_SIZE, }; use xayn_test_utils::{asset::ort_target, env::clear_env}; -use xayn_web_api::{config, start, Application, WebApi}; +use xayn_web_api::{config, start, Application, Ingestion, Personalization}; use xayn_web_api_db_ctrl::{elastic_create_tenant, LegacyTenantInfo, Silo}; use xayn_web_api_shared::{ elastic, @@ -181,7 +181,7 @@ fn test_full_migration() { let (pg_config, es_config) = legacy_test_setup(&test_id).await?; let ingestion_config = build_test_config_from_parts_and_names( - WebApi::NAME, + Ingestion::NAME, &pg_config, &es_config, Table::new(), @@ -190,7 +190,7 @@ fn test_full_migration() { ); let personalization_config = build_test_config_from_parts_and_names( - WebApi::NAME, + Personalization::NAME, &pg_config, &es_config, Table::new(), @@ -308,7 +308,7 @@ fn test_full_migration() { let config = config::load_with_args([""; 0], { let config = build_test_config_from_parts_and_names( - WebApi::NAME, + Ingestion::NAME, &pg_config, &es_config, Table::new(), @@ -323,12 +323,12 @@ fn test_full_migration() { &format!("inline:{config}"), ] }); - let ingestion = start::(config).await?; + let ingestion = start::(config).await?; info!("started new ingestion"); let ingestion_url = ingestion.url(); let config = config::load_with_args([""; 0], { let config = build_test_config_from_parts_and_names( - WebApi::NAME, + Personalization::NAME, &pg_config, &es_config, Table::new(), @@ -343,7 +343,7 @@ fn test_full_migration() { &format!("inline:{config}"), ] }); - let personalization = start::(config).await?; + let personalization = start::(config).await?; info!("started new personalization"); let personalization_url = personalization.url(); diff --git a/integration-tests/tests/manage_indexed_properties.rs b/integration-tests/tests/manage_indexed_properties.rs index d5c16e915..e080fe6a2 100644 --- a/integration-tests/tests/manage_indexed_properties.rs +++ b/integration-tests/tests/manage_indexed_properties.rs @@ -17,11 +17,11 @@ use serde_json::{json, Value}; use toml::toml; use url::Url; use xayn_integration_tests::{send_assert, send_assert_json, test_app, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; #[test] fn test_creating_indexed_properties() { - test_app::( + test_app::( UNCHANGED_CONFIG, |client, ingestion_url, services| async move { let res = send_assert_json::( @@ -182,7 +182,7 @@ fn test_creating_indexed_properties() { #[test] fn test_check_new_property_values_against_schema() { - test_app::( + test_app::( Some(toml! { [ingestion.index_update] method = "background" diff --git a/integration-tests/tests/personalized_semantic_search.rs b/integration-tests/tests/personalized_semantic_search.rs index fef6ebd5f..9dcb3d253 100644 --- a/integration-tests/tests/personalized_semantic_search.rs +++ b/integration-tests/tests/personalized_semantic_search.rs @@ -19,7 +19,7 @@ use serde::Deserialize; use serde_json::json; use toml::toml; use xayn_integration_tests::{send_assert, send_assert_json, test_two_apps, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; async fn ingest(client: &Client, ingestion_url: &Url) -> Result<(), Error> { send_assert( @@ -93,7 +93,7 @@ macro_rules! assert_order { #[test] fn test_full_personalization() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, Some(toml! { [semantic_search] @@ -170,7 +170,7 @@ fn test_full_personalization() { #[test] fn test_subtle_personalization() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, Some(toml! { [semantic_search] @@ -207,7 +207,7 @@ fn test_subtle_personalization() { #[test] fn test_full_personalization_with_inline_history() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, Some(toml! { [semantic_search] diff --git a/integration-tests/tests/recommendations.rs b/integration-tests/tests/recommendations.rs index 1ca8ec0c4..6096239ac 100644 --- a/integration-tests/tests/recommendations.rs +++ b/integration-tests/tests/recommendations.rs @@ -18,7 +18,7 @@ use reqwest::{Client, StatusCode, Url}; use serde::Deserialize; use serde_json::json; use xayn_integration_tests::{send_assert, send_assert_json, test_two_apps, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; async fn ingest(client: &Client, ingestion_url: &Url) -> Result<(), Error> { send_assert( @@ -93,7 +93,7 @@ macro_rules! assert_order { #[test] fn test_full_personalization_with_inline_history() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _services| async move { @@ -125,7 +125,7 @@ fn test_full_personalization_with_inline_history() { #[test] fn test_full_personalization_with_user_id_that_does_not_exist() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _services| async move { @@ -152,7 +152,7 @@ fn test_full_personalization_with_user_id_that_does_not_exist() { #[test] fn test_full_personalization_with_user_id_that_has_two_interactions() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _services| async move { diff --git a/integration-tests/tests/semantic_search.rs b/integration-tests/tests/semantic_search.rs index 456bc6455..1c7034276 100644 --- a/integration-tests/tests/semantic_search.rs +++ b/integration-tests/tests/semantic_search.rs @@ -24,7 +24,7 @@ use xayn_integration_tests::{ with_dev_options, UNCHANGED_CONFIG, }; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; async fn ingest(client: &Client, ingestion_url: &Url) -> Result<(), Error> { send_assert( @@ -96,7 +96,7 @@ macro_rules! assert_order { #[test] fn test_semantic_search() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -141,7 +141,7 @@ fn test_semantic_search() { #[test] fn test_semantic_search_with_query() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -211,7 +211,7 @@ fn test_semantic_search_with_query() { #[test] fn test_semantic_search_with_dev_option_hybrid() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { @@ -322,7 +322,7 @@ fn test_semantic_search_with_dev_option_hybrid() { #[test] fn test_semantic_search_with_dev_option_candidates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { @@ -391,7 +391,7 @@ fn test_semantic_search_with_dev_option_candidates() { #[test] fn test_semantic_search_include_snippet() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -434,7 +434,7 @@ fn test_semantic_search_include_snippet() { #[test] fn test_semantic_search_with_dev_option_raw_scores() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { diff --git a/integration-tests/tests/silo_management_api.rs b/integration-tests/tests/silo_management_api.rs index fc157c711..25c42f72d 100644 --- a/integration-tests/tests/silo_management_api.rs +++ b/integration-tests/tests/silo_management_api.rs @@ -19,7 +19,7 @@ use serde::Deserialize; use serde_json::json; use toml::toml; use xayn_integration_tests::{send_assert_json, test_app}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; use xayn_web_api_db_ctrl::{OperationResult, Tenant}; use xayn_web_api_shared::request::TenantId; @@ -30,7 +30,7 @@ struct ManagementResponse { #[test] fn test_tenants_can_be_created() { - test_app::( + test_app::( Some(toml! { [tenants] enable_legacy_tenant = false diff --git a/integration-tests/tests/split_documents.rs b/integration-tests/tests/split_documents.rs index 28c25d38d..3fbe0c075 100644 --- a/integration-tests/tests/split_documents.rs +++ b/integration-tests/tests/split_documents.rs @@ -29,7 +29,7 @@ use xayn_integration_tests::{ with_dev_options, UNCHANGED_CONFIG, }; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; #[derive(Debug, Deserialize)] struct PersonalizedDocumentData { @@ -193,7 +193,7 @@ async fn ingest(client: &Client, ingestion_url: &Url) -> Result<(), Error> { #[test] fn test_split_documents_for_semantic_search() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { @@ -232,7 +232,7 @@ fn test_split_documents_for_semantic_search() { #[test] fn test_split_documents_with_set_candidates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { @@ -271,7 +271,7 @@ fn test_split_documents_with_set_candidates() { #[test] fn test_split_documents_with_property_updates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { @@ -438,7 +438,7 @@ fn test_split_documents_with_property_updates() { #[test] fn test_split_allows_huge_snippets() { - test_app::( + test_app::( Some(toml! { [ingestion] max_snippet_size = 3 @@ -493,7 +493,7 @@ fn test_split_allows_huge_snippets() { #[test] fn test_endpoints_which_do_not_yet_fully_support_split_do_not_fall_over() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, with_dev_options(), |client, ingestion_url, personalization_url, _| async move { diff --git a/integration-tests/tests/store_user_history.rs b/integration-tests/tests/store_user_history.rs index 24d4e19a9..28b9f12a4 100644 --- a/integration-tests/tests/store_user_history.rs +++ b/integration-tests/tests/store_user_history.rs @@ -19,7 +19,7 @@ use serde::Deserialize; use serde_json::json; use toml::toml; use xayn_integration_tests::{send_assert, send_assert_json, test_two_apps, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; #[derive(Deserialize)] struct PersonalizedDocumentData { @@ -32,7 +32,7 @@ struct PersonalizedDocumentsResponse { } fn store_user_history(enabled: bool) { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, Some(toml! { [personalization] diff --git a/integration-tests/tests/tenant_id_header.rs b/integration-tests/tests/tenant_id_header.rs index 195deb8cb..3554968a1 100644 --- a/integration-tests/tests/tenant_id_header.rs +++ b/integration-tests/tests/tenant_id_header.rs @@ -16,11 +16,11 @@ use reqwest::{Client, StatusCode}; use serde_json::json; use toml::toml; use xayn_integration_tests::{send_assert, test_app}; -use xayn_web_api::WebApi; +use xayn_web_api::Ingestion; #[test] fn test_tenant_id_is_required_if_legacy_tenant_is_disabled() { - test_app::( + test_app::( Some(toml! { [tenants] enable_legacy_tenant = false @@ -49,7 +49,7 @@ fn test_tenant_id_is_required_if_legacy_tenant_is_disabled() { #[test] fn test_tenant_id_is_not_required_if_legacy_tenant_is_enabled() { - test_app::( + test_app::( Some(toml! { [tenants] enable_legacy_tenant = true diff --git a/integration-tests/tests/user_recommendations.rs b/integration-tests/tests/user_recommendations.rs index e019ab6fa..ae020850b 100644 --- a/integration-tests/tests/user_recommendations.rs +++ b/integration-tests/tests/user_recommendations.rs @@ -20,7 +20,7 @@ use reqwest::{Client, Request, StatusCode, Url}; use serde::Deserialize; use serde_json::{json, Value}; use xayn_integration_tests::{send_assert, send_assert_json, test_two_apps, UNCHANGED_CONFIG}; -use xayn_web_api::WebApi; +use xayn_web_api::{Ingestion, Personalization}; use xayn_web_api_shared::serde::json_object; async fn ingest_with_dates(client: &Client, ingestion_url: &Url) -> Result<(), Error> { @@ -304,7 +304,7 @@ async fn personalize( #[test] fn test_personalization_all_dates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -325,7 +325,7 @@ fn test_personalization_all_dates() { #[test] fn test_personalization_limited_dates() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -352,7 +352,7 @@ fn test_personalization_limited_dates() { #[test] fn test_personalization_with_tags() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -372,7 +372,7 @@ fn test_personalization_with_tags() { } fn personalization_include_properties(include_properties: bool) { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { @@ -413,7 +413,7 @@ fn test_personalization_exclude_properties() { #[test] fn test_personalize_no_body() { - test_two_apps::( + test_two_apps::( UNCHANGED_CONFIG, UNCHANGED_CONFIG, |client, ingestion_url, personalization_url, _| async move { diff --git a/web-api/compose.single-machine-deploy.yml b/web-api/compose.single-machine-deploy.yml index 2ba3698e7..d40428e58 100644 --- a/web-api/compose.single-machine-deploy.yml +++ b/web-api/compose.single-machine-deploy.yml @@ -36,8 +36,8 @@ services: timeout: 5s retries: 5 - webapi: - image: xaynetci/xayn_discovery_web_service:ET-5231-update-image-generation-231204120506-693e23c-xaynia_v0201-arm64 + backoffice: + image: xaynetci/xayn_discovery_ingestion_service:main-231019104503-d06d92c-e5_small_v2_v0001-amd64 mem_limit: 1.5g cpus: 2 restart: always @@ -63,6 +63,30 @@ services: db_ready: condition: service_completed_successfully + frontoffice: + image: xaynetci/xayn_discovery_web_service:main-231019104503-d06d92c-e5_small_v2_v0001-amd64 + mem_limit: 1.5g + cpus: 2 + restart: always + environment: + XAYN_WEB_API__STORAGE__POSTGRES__BASE_URL: "postgres://${POSTGRES_USER:-user}:${POSTGRES_PASSWORD:-pw}@postgres:5432/xayn" + XAYN_WEB_API__STORAGE__ELASTIC__URL: "http://elasticsearch:9200/" + XAYN_WEB_API__NET__BIND_TO: "0.0.0.0:8082" + XAYN_WEB_API__STORAGE__POSTGRES__USER: "${POSTGRES_USER:-user}" + XAYN_WEB_API__STORAGE__POSTGRES__PASSWORD: "${POSTGRES_PASSWORD:-pw}" + XAYN_WEB_API__STORAGE__POSTGRES__APPLICATION_NAME: "xayn" + XAYN_WEB_API__EMBEDDING__TOKEN_SIZE: "512" + XAYN_WEB_API__STORAGE__ELASTIC__USER: "elastic" + XAYN_WEB_API__EMBEDDING__TYPE: "pipeline" + XAYN_WEB_API__STORAGE__ELASTIC__INDEX_NAME: "xayn" + ports: + - "8082:8082" + networks: + - internal + depends_on: + db_ready: + condition: service_completed_successfully + tika: image: apache/tika:2.8.0.0-full mem_limit: 512m diff --git a/web-api/src/app.rs b/web-api/src/app.rs index 7f7c88ac9..83c4753e5 100644 --- a/web-api/src/app.rs +++ b/web-api/src/app.rs @@ -24,7 +24,6 @@ use tracing::{info, instrument}; pub(crate) use self::state::{AppState, TenantState}; use crate::{ - config::WebApiConfig, embedding, extractor, logging, @@ -49,6 +48,7 @@ pub trait Application: 'static { + Sync + Debug + 'static; + type Extension: Send + Sync + 'static; /// Configures the actix service(s) used by this application. /// @@ -62,6 +62,12 @@ pub trait Application: 'static { /// application middleware and will not be reachable using anything /// which uses CORS. fn configure_ops_service(_config: &mut ServiceConfig) {} + + /// Create an application specific extension to app state. + //Design Note: We could handle this by adding `TyFrom<&Config<..>>` bounds + // to `Extension` but using this helper method is simpler + // and it is also easier to add async if needed (using #[async-trait]). + fn create_extension(config: &Self::Config) -> Result; } pub type SetupError = anyhow::Error; @@ -70,7 +76,7 @@ pub type SetupError = anyhow::Error; /// /// The return value is the exit code which should be used. #[instrument(skip_all)] -pub async fn start(config: WebApiConfig) -> Result +pub async fn start(config: A::Config) -> Result where A: Application + 'static, { @@ -80,7 +86,7 @@ where info!(pwd=?pwd); let net_config = net::Config::clone(config.as_ref()); - let app_state = Arc::new(AppState::create(config).await?); + let app_state = Arc::new(AppState::::create(config).await?); let legacy_tenant = app_state.legacy_tenant().cloned(); let shutdown = Box::new({ @@ -114,11 +120,6 @@ macro_rules! application_names { } else { format!("XAYN_{name}") }; - [ - name, - "XAYN_WEB_API".to_string(), - "XAYN_PERSONALIZATION".to_string(), - "XAYN_INGESTION".to_string(), - ] + [name, "XAYN_WEB_API".to_string()] }}; } diff --git a/web-api/src/app/state.rs b/web-api/src/app/state.rs index 86f53f948..ecd77b2a0 100644 --- a/web-api/src/app/state.rs +++ b/web-api/src/app/state.rs @@ -20,16 +20,13 @@ use actix_web::{ FromRequest, HttpRequest, }; -use derive_more::AsRef; +use derive_more::{AsRef, Deref}; use futures_util::future::{ready, Ready}; -use xayn_ai_coi::CoiSystem; -use xayn_snippet_extractor::pool::SnippetExtractorPool; use xayn_web_api_db_ctrl::Silo; use xayn_web_api_shared::request::TenantId; use crate::{ - app::SetupError, - config::WebApiConfig, + app::{Application, SetupError}, embedding::Embedder, error::common::InternalError, extractor::TextExtractor, @@ -38,19 +35,25 @@ use crate::{ Error, }; -#[derive(AsRef)] -pub(crate) struct AppState { +#[derive(AsRef, Deref)] +pub(crate) struct AppState +where + A: Application, +{ #[as_ref(forward)] - pub(crate) config: WebApiConfig, + pub(crate) config: A::Config, pub(crate) embedder: Embedder, pub(crate) extractor: TextExtractor, - pub(crate) snippet_extractor: SnippetExtractorPool, - pub(crate) coi: CoiSystem, + #[deref] + pub(crate) extension: A::Extension, storage_builder: Arc, silo: Arc, } -impl AppState { +impl AppState +where + A: Application, +{ pub(super) fn attach_to(self: Arc, service: &mut ServiceConfig) { service .app_data(self.storage_builder.clone()) @@ -58,20 +61,19 @@ impl AppState { .app_data(Data::from(self)); } - pub(super) async fn create(config: WebApiConfig) -> Result { + pub(super) async fn create(config: A::Config) -> Result { + let extension = A::create_extension(&config)?; // embedder config is validated during loading let embedder = Embedder::load(config.as_ref()).await?; let extractor = TextExtractor::new(config.as_ref())?; let (silo, legacy_tenant) = initialize_silo(config.as_ref(), config.as_ref(), embedder.embedding_size()).await?; let storage_builder = Arc::new(Storage::builder(config.as_ref(), legacy_tenant).await?); - let snippet_extractor = SnippetExtractorPool::new(config.as_ref())?; Ok(Self { - coi: config.coi.clone().build(), config, embedder, extractor, - snippet_extractor, + extension, storage_builder, silo: Arc::new(silo), }) diff --git a/web-api/src/backoffice.rs b/web-api/src/backoffice.rs deleted file mode 100644 index d95deb36d..000000000 --- a/web-api/src/backoffice.rs +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2022 Xayn AG -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, version 3. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -pub(crate) mod preprocessor; -pub(crate) mod routes; - -use anyhow::bail; -use serde::{Deserialize, Serialize}; - -use crate::{app::SetupError, storage::elastic::IndexUpdateConfig}; - -#[derive(Debug, Deserialize, Serialize)] -#[serde(default)] -#[cfg_attr(test, serde(deny_unknown_fields))] -pub struct IngestionConfig { - pub(crate) max_document_batch_size: usize, - pub(crate) max_indexed_properties: usize, - pub(crate) index_update: IndexUpdateConfig, - pub(crate) max_snippet_size: usize, - pub(crate) max_properties_size: usize, - pub(crate) max_properties_string_size: usize, -} - -impl Default for IngestionConfig { - fn default() -> Self { - Self { - max_document_batch_size: 100, - // 10 + publication_date - max_indexed_properties: 11, - index_update: IndexUpdateConfig::default(), - max_snippet_size: 2_048, - max_properties_size: 2_560, - max_properties_string_size: 2_048, - } - } -} - -impl IngestionConfig { - pub(crate) fn validate(&self) -> Result<(), SetupError> { - if self.max_indexed_properties == 0 { - bail!("invalid IngestionConfig, max_indexed_properties must be > 0 to account for publication_date"); - } - self.index_update.validate()?; - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_validate_default_ingestion_config() { - IngestionConfig::default().validate().unwrap(); - } -} diff --git a/web-api/src/bin/web-api.rs b/web-api/src/bin/ingestion.rs similarity index 84% rename from web-api/src/bin/web-api.rs rename to web-api/src/bin/ingestion.rs index 04e766a10..e185bb244 100644 --- a/web-api/src/bin/web-api.rs +++ b/web-api/src/bin/ingestion.rs @@ -13,16 +13,17 @@ // along with this program. If not, see . use tracing::instrument; -use xayn_web_api::{application_names, config, logging, start, Application, WebApi}; +use xayn_web_api::{application_names, config, logging, start, Application, Ingestion}; -type Config = ::Config; +type Config = ::Config; #[tokio::main] #[instrument(err)] async fn main() -> Result<(), anyhow::Error> { let config: Config = config::load(application_names!()); logging::initialize_global(config.as_ref())?; - config.validate()?; - - start::(config).await?.wait_for_termination().await + start::(config) + .await? + .wait_for_termination() + .await } diff --git a/web-api/src/web_api.rs b/web-api/src/bin/personalization.rs similarity index 52% rename from web-api/src/web_api.rs rename to web-api/src/bin/personalization.rs index 628b7db30..74570cbf2 100644 --- a/web-api/src/web_api.rs +++ b/web-api/src/bin/personalization.rs @@ -12,25 +12,18 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use actix_web::web::ServiceConfig; -use async_trait::async_trait; +use tracing::instrument; +use xayn_web_api::{application_names, config, logging, start, Application, Personalization}; -use crate::{app::Application, config::WebApiConfig}; +type Config = ::Config; -pub struct WebApi; - -#[async_trait] -impl Application for WebApi { - const NAME: &'static str = "XAYN_WEB_API"; - - type Config = WebApiConfig; - - fn configure_service(config: &mut ServiceConfig) { - crate::backoffice::routes::configure_service(config); - crate::frontoffice::routes::configure_service(config); - } - - fn configure_ops_service(config: &mut ServiceConfig) { - crate::backoffice::routes::configure_ops_service(config); - } +#[tokio::main] +#[instrument(err)] +async fn main() -> Result<(), anyhow::Error> { + let config: Config = config::load(application_names!()); + logging::initialize_global(config.as_ref())?; + start::(config) + .await? + .wait_for_termination() + .await } diff --git a/web-api/src/config.rs b/web-api/src/config.rs index 005eba893..d77a9fb8a 100644 --- a/web-api/src/config.rs +++ b/web-api/src/config.rs @@ -17,53 +17,13 @@ mod cli; use std::{ffi::OsString, fmt::Display, path::Path, process::exit}; use clap::{CommandFactory, Parser}; -use derive_more::AsRef; use figment::{ providers::{Env, Format, Serialized, Toml}, Figment, }; -use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use xayn_ai_coi::CoiConfig; +use serde::{de::DeserializeOwned, Serialize}; use self::cli::Args; -use crate::{ - backoffice::IngestionConfig, - embedding, - extractor, - frontoffice::{PersonalizationConfig, SemanticSearchConfig}, - logging, - net, - storage::{self}, - tenants, - SetupError, -}; - -#[derive(AsRef, Debug, Default, Deserialize, Serialize)] -#[serde(default)] -#[cfg_attr(test, serde(deny_unknown_fields))] -pub struct WebApiConfig { - pub(crate) logging: logging::Config, - pub(crate) net: net::Config, - pub(crate) storage: storage::Config, - pub(crate) coi: CoiConfig, - pub(crate) embedding: embedding::Config, - pub(crate) text_extractor: extractor::Config, - pub(crate) personalization: PersonalizationConfig, - pub(crate) semantic_search: SemanticSearchConfig, - pub(crate) ingestion: IngestionConfig, - pub(crate) snippet_extractor: xayn_snippet_extractor::Config, - pub(crate) tenants: tenants::Config, -} - -impl WebApiConfig { - pub fn validate(&self) -> Result<(), SetupError> { - self.ingestion.validate()?; - self.personalization.validate()?; - self.semantic_search.validate()?; - - Ok(()) - } -} /// Loads the config with custom CLI args. /// diff --git a/web-api/src/extractor.rs b/web-api/src/extractor.rs index bbb979e6d..ebc95415d 100644 --- a/web-api/src/extractor.rs +++ b/web-api/src/extractor.rs @@ -29,8 +29,8 @@ use url::Url; use xayn_web_api_shared::{elastic::SegmentableUrl, serde::serde_duration_in_config}; use crate::{ - backoffice::preprocessor::PreprocessError, error::common::{FileUploadNotEnabled, InvalidBinary}, + ingestion::preprocessor::PreprocessError, models::DocumentSnippet, }; diff --git a/web-api/src/ingestion.rs b/web-api/src/ingestion.rs new file mode 100644 index 000000000..dabf6d6f1 --- /dev/null +++ b/web-api/src/ingestion.rs @@ -0,0 +1,153 @@ +// Copyright 2022 Xayn AG +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, version 3. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +pub(crate) mod preprocessor; +mod routes; + +use actix_web::web::ServiceConfig; +use anyhow::bail; +use async_trait::async_trait; +use derive_more::AsRef; +use futures_util::TryFutureExt; +use serde::{Deserialize, Serialize}; +use xayn_snippet_extractor::pool::SnippetExtractorPool; + +use self::{ + preprocessor::{preprocess, PreprocessError}, + routes::InputData, +}; +use crate::{ + app::{self, Application, SetupError}, + embedding::{self, EmbeddingKind}, + extractor, + logging, + models::{DocumentContent, PreprocessingStep}, + net, + storage::{self, elastic::IndexUpdateConfig}, + tenants, + Error, +}; + +pub struct Ingestion; + +#[async_trait] +impl Application for Ingestion { + const NAME: &'static str = "XAYN_INGESTION"; + + type Config = Config; + type Extension = Extension; + + fn configure_service(config: &mut ServiceConfig) { + routes::configure_service(config); + } + + fn configure_ops_service(config: &mut ServiceConfig) { + routes::configure_ops_service(config); + } + + fn create_extension(config: &Self::Config) -> Result { + config.ingestion.validate()?; + + let snippet_extractor = SnippetExtractorPool::new(config.as_ref())?; + + Ok(Extension { snippet_extractor }) + } +} + +type AppState = app::AppState; + +impl AppState { + pub(crate) async fn preprocess( + &self, + kind: EmbeddingKind, + original: InputData, + preprocessing_step: &mut PreprocessingStep, + ) -> Result, PreprocessError> { + preprocess( + &self.embedder, + || self.snippet_extractor.get().map_err(Error::from), + &self.extractor, + kind, + original, + preprocessing_step, + ) + .await + } +} + +#[derive(AsRef, Debug, Default, Deserialize, Serialize)] +#[serde(default)] +#[cfg_attr(test, serde(deny_unknown_fields))] +pub struct Config { + pub(crate) logging: logging::Config, + pub(crate) net: net::Config, + pub(crate) storage: storage::Config, + pub(crate) ingestion: IngestionConfig, + pub(crate) embedding: embedding::Config, + pub(crate) text_extractor: extractor::Config, + pub(crate) tenants: tenants::Config, + pub(crate) snippet_extractor: xayn_snippet_extractor::Config, +} + +#[derive(Debug, Deserialize, Serialize)] +#[serde(default)] +#[cfg_attr(test, serde(deny_unknown_fields))] +pub struct IngestionConfig { + pub(crate) max_document_batch_size: usize, + pub(crate) max_indexed_properties: usize, + pub(crate) index_update: IndexUpdateConfig, + pub(crate) max_snippet_size: usize, + pub(crate) max_properties_size: usize, + pub(crate) max_properties_string_size: usize, +} + +impl Default for IngestionConfig { + fn default() -> Self { + Self { + max_document_batch_size: 100, + // 10 + publication_date + max_indexed_properties: 11, + index_update: IndexUpdateConfig::default(), + max_snippet_size: 2_048, + max_properties_size: 2_560, + max_properties_string_size: 2_048, + } + } +} + +impl IngestionConfig { + fn validate(&self) -> Result<(), SetupError> { + if self.max_indexed_properties == 0 { + bail!("invalid IngestionConfig, max_indexed_properties must be > 0 to account for publication_date"); + } + self.index_update.validate()?; + + Ok(()) + } +} + +#[derive(AsRef)] +pub struct Extension { + snippet_extractor: SnippetExtractorPool, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_default_ingestion_config() { + IngestionConfig::default().validate().unwrap(); + } +} diff --git a/web-api/src/backoffice/preprocessor.rs b/web-api/src/ingestion/preprocessor.rs similarity index 100% rename from web-api/src/backoffice/preprocessor.rs rename to web-api/src/ingestion/preprocessor.rs diff --git a/web-api/src/backoffice/routes.rs b/web-api/src/ingestion/routes.rs similarity index 97% rename from web-api/src/backoffice/routes.rs rename to web-api/src/ingestion/routes.rs index b8207f4b8..81321497b 100644 --- a/web-api/src/backoffice/routes.rs +++ b/web-api/src/ingestion/routes.rs @@ -21,10 +21,7 @@ use actix_web::{ }; use anyhow::anyhow; use base64::{engine::general_purpose, Engine as _}; -use futures_util::{ - stream::{FuturesOrdered, StreamExt}, - TryFutureExt, -}; +use futures_util::stream::{FuturesOrdered, StreamExt}; use itertools::{Either, Itertools}; use reqwest::StatusCode; use serde::{Deserialize, Serialize}; @@ -33,11 +30,9 @@ use tokio::time::Instant; use tracing::{debug, error, info, instrument}; use xayn_web_api_db_ctrl::{Operation, Silo}; -use super::preprocessor::PreprocessError; +use super::{preprocessor::PreprocessError, AppState}; use crate::{ - app::{AppState, TenantState}, - backoffice, - backoffice::IngestionConfig, + app::TenantState, embedding::EmbeddingKind, error::common::{ BadRequest, @@ -51,6 +46,7 @@ use crate::{ FileUploadNotEnabled, InvalidDocumentSnippet, }, + ingestion::IngestionConfig, models::{ self, DocumentId, @@ -67,7 +63,7 @@ use crate::{ Error, }; -pub(crate) fn configure_service(config: &mut ServiceConfig) { +pub(super) fn configure_service(config: &mut ServiceConfig) { config .service( web::resource("/documents") @@ -111,7 +107,7 @@ pub(crate) fn configure_service(config: &mut ServiceConfig) { ); } -pub(crate) fn configure_ops_service(config: &mut ServiceConfig) { +pub(super) fn configure_ops_service(config: &mut ServiceConfig) { config.service(web::resource("/silo_management").route(web::post().to(silo_management))); } @@ -494,17 +490,9 @@ async fn upsert_documents( let id = document.id; let original_sha256 = Sha256Hash::calculate(document.original.as_bytes()); - let result = backoffice::preprocessor::preprocess( - &state.embedder, - || state.snippet_extractor.get().map_err(Error::from), - &state.extractor, - EmbeddingKind::Content, - document.original, - &mut document.preprocessing_step, - ) - .await; - - match result + match state + .preprocess(EmbeddingKind::Content, document.original, &mut document.preprocessing_step) + .await { Ok(snippets) => Ok(models::DocumentForIngestion { id, diff --git a/web-api/src/lib.rs b/web-api/src/lib.rs index 7899b752d..dbb7d6f40 100644 --- a/web-api/src/lib.rs +++ b/web-api/src/lib.rs @@ -32,30 +32,29 @@ )] mod app; -mod backoffice; pub mod config; mod embedding; mod error; pub mod extractor; -mod frontoffice; +mod ingestion; pub mod logging; mod middleware; #[cfg(test)] mod mind; mod models; mod net; +mod personalization; pub mod rank_merge; mod storage; mod tenants; mod utils; -mod web_api; pub use crate::{ app::{start, Application, SetupError}, error::application::{ApplicationError, Error}, - frontoffice::{bench_derive_interests, bench_rerank}, + ingestion::Ingestion, net::AppHandle, - web_api::WebApi, + personalization::{bench_derive_interests, bench_rerank, Personalization}, }; /// Allow migration tests to have access to the elastic search mapping this uses. diff --git a/web-api/src/mind.rs b/web-api/src/mind.rs index ceb919dca..8d7d3f346 100644 --- a/web-api/src/mind.rs +++ b/web-api/src/mind.rs @@ -27,13 +27,13 @@ use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; use xayn_test_utils::error::Panic; use crate::{ - frontoffice::routes::PersonalizeBy, mind::{ config::{GridSearchConfig, PersonaBasedConfig, SaturationConfig, StateConfig}, data::{read, DocumentProvider, Impression, Ndcg, SpecificTopics, Users}, state::{SaturationIteration, SaturationResult, SaturationTopicResult, State}, }, models::{SnippetOrDocumentId, UserId}, + personalization::routes::PersonalizeBy, storage::memory::Storage, }; diff --git a/web-api/src/mind/config.rs b/web-api/src/mind/config.rs index 9ebce07cb..20f028ced 100644 --- a/web-api/src/mind/config.rs +++ b/web-api/src/mind/config.rs @@ -17,7 +17,7 @@ use serde::{Deserialize, Serialize}; use xayn_ai_coi::CoiConfig; use xayn_test_utils::error::Panic; -use crate::frontoffice::{routes::PersonalizeBy, PersonalizationConfig}; +use crate::personalization::{routes::PersonalizeBy, PersonalizationConfig}; #[derive(Debug, Serialize)] pub(super) struct StateConfig { diff --git a/web-api/src/mind/state.rs b/web-api/src/mind/state.rs index 9f009d6bb..8de4d5336 100644 --- a/web-api/src/mind/state.rs +++ b/web-api/src/mind/state.rs @@ -24,10 +24,6 @@ use xayn_test_utils::{ use crate::{ embedding::{self, Embedder, EmbeddingKind, Pipeline}, - frontoffice::{ - routes::{personalize_documents_by, update_interactions, PersonalizeBy}, - PersonalizationConfig, - }, mind::{config::StateConfig, data::Document}, models::{ DocumentContent, @@ -39,6 +35,10 @@ use crate::{ SnippetOrDocumentId, UserId, }, + personalization::{ + routes::{personalize_documents_by, update_interactions, PersonalizeBy}, + PersonalizationConfig, + }, storage::{self, memory::Storage}, }; diff --git a/web-api/src/models.rs b/web-api/src/models.rs index f43e4f15a..9961473fb 100644 --- a/web-api/src/models.rs +++ b/web-api/src/models.rs @@ -573,7 +573,7 @@ impl PreprocessingStep { #[cfg(test)] mod tests { use super::*; - use crate::frontoffice::SemanticSearchConfig; + use crate::personalization::SemanticSearchConfig; impl TryFrom for DocumentProperty { type Error = InvalidDocumentProperty; diff --git a/web-api/src/frontoffice.rs b/web-api/src/personalization.rs similarity index 76% rename from web-api/src/frontoffice.rs rename to web-api/src/personalization.rs index 661b6da79..42c19a061 100644 --- a/web-api/src/frontoffice.rs +++ b/web-api/src/personalization.rs @@ -20,11 +20,66 @@ mod stateless; use std::ops::RangeBounds; +use actix_web::web::ServiceConfig; use anyhow::bail; +use async_trait::async_trait; +use derive_more::AsRef; use serde::{Deserialize, Serialize}; +use xayn_ai_coi::{CoiConfig, CoiSystem}; pub use self::{rerank::bench_rerank, stateless::bench_derive_interests}; -use crate::app::SetupError; +use crate::{ + app::{self, Application, SetupError}, + embedding, + extractor, + logging, + net, + storage, + tenants, +}; + +pub struct Personalization; + +#[async_trait] +impl Application for Personalization { + const NAME: &'static str = "XAYN_PERSONALIZATION"; + + type Config = Config; + type Extension = Extension; + + fn configure_service(config: &mut ServiceConfig) { + routes::configure_service(config); + } + + fn create_extension(config: &Self::Config) -> Result { + if let Err(error) = config.coi.validate() { + bail!("invalid CoiConfig, {error}"); + } + config.personalization.validate()?; + config.semantic_search.validate()?; + + Ok(Extension { + coi: config.coi.clone().build(), + }) + } +} + +type AppState = app::AppState; + +#[derive(AsRef, Debug, Default, Deserialize, Serialize)] +#[serde(default)] +#[cfg_attr(test, serde(deny_unknown_fields))] +pub struct Config { + pub(crate) logging: logging::Config, + pub(crate) net: net::Config, + pub(crate) storage: storage::Config, + pub(crate) coi: CoiConfig, + pub(crate) embedding: embedding::Config, + pub(crate) text_extractor: extractor::Config, + pub(crate) personalization: PersonalizationConfig, + pub(crate) semantic_search: SemanticSearchConfig, + pub(crate) tenants: tenants::Config, +} #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(default)] @@ -74,7 +129,7 @@ impl Default for PersonalizationConfig { } impl PersonalizationConfig { - pub(crate) fn validate(&self) -> Result<(), SetupError> { + fn validate(&self) -> Result<(), SetupError> { if self.max_number_documents > self.max_number_candidates { // this is stricter than necessary, but ok for our use cases bail!("invalid PersonalizationConfig, max_number_documents must be <= max_number_candidates"); @@ -130,7 +185,7 @@ impl Default for SemanticSearchConfig { } impl SemanticSearchConfig { - pub(crate) fn validate(&self) -> Result<(), SetupError> { + fn validate(&self) -> Result<(), SetupError> { if self.max_number_documents > self.max_number_candidates { // this is stricter than necessary, but ok for our use cases bail!("invalid SemanticSearchConfig, max_number_documents must be <= max_number_candidates"); @@ -146,6 +201,11 @@ impl SemanticSearchConfig { } } +#[derive(AsRef)] +pub struct Extension { + pub(crate) coi: CoiSystem, +} + #[cfg(test)] mod tests { use super::*; diff --git a/web-api/src/frontoffice/filter.rs b/web-api/src/personalization/filter.rs similarity index 100% rename from web-api/src/frontoffice/filter.rs rename to web-api/src/personalization/filter.rs diff --git a/web-api/src/frontoffice/knn.rs b/web-api/src/personalization/knn.rs similarity index 98% rename from web-api/src/frontoffice/knn.rs rename to web-api/src/personalization/knn.rs index d9d39e3d4..db199f2b4 100644 --- a/web-api/src/frontoffice/knn.rs +++ b/web-api/src/personalization/knn.rs @@ -22,8 +22,8 @@ use xayn_ai_coi::{compute_coi_weights, Coi}; use crate::{ error::common::InternalError, - frontoffice::filter::Filter, models::{PersonalizedDocument, SnippetId}, + personalization::filter::Filter, rank_merge::{rrf_score, DEFAULT_RRF_K}, storage::{self, Exclusions, KnnSearchParams, SearchStrategy}, Error, @@ -148,7 +148,7 @@ mod tests { use xayn_ai_coi::CoiConfig; use super::*; - use crate::{frontoffice::PersonalizationConfig, storage::memory::Storage}; + use crate::{personalization::PersonalizationConfig, storage::memory::Storage}; #[tokio::test] async fn test_search_knn_documents_for_empty_cois() { diff --git a/web-api/src/frontoffice/rerank.rs b/web-api/src/personalization/rerank.rs similarity index 100% rename from web-api/src/frontoffice/rerank.rs rename to web-api/src/personalization/rerank.rs diff --git a/web-api/src/frontoffice/routes.rs b/web-api/src/personalization/routes.rs similarity index 99% rename from web-api/src/frontoffice/routes.rs rename to web-api/src/personalization/routes.rs index d4372f956..c90c2e122 100644 --- a/web-api/src/frontoffice/routes.rs +++ b/web-api/src/personalization/routes.rs @@ -37,11 +37,12 @@ use super::{ HistoryEntry, UnvalidatedHistoryEntry, }, + AppState, PersonalizationConfig, SemanticSearchConfig, }; use crate::{ - app::{AppState, TenantState}, + app::TenantState, embedding::EmbeddingKind, error::{ common::{BadRequest, DocumentNotFound, ForbiddenDevOption, InvalidDocumentCount}, @@ -72,7 +73,7 @@ use crate::{ Error, }; -pub(crate) fn configure_service(config: &mut ServiceConfig) { +pub(super) fn configure_service(config: &mut ServiceConfig) { let users = web::scope("/users/{user_id}") .service(web::resource("interactions").route(web::patch().to(interactions))) .service(web::resource("recommendations").route(web::post().to(user_recommendations))) diff --git a/web-api/src/frontoffice/stateless.rs b/web-api/src/personalization/stateless.rs similarity index 100% rename from web-api/src/frontoffice/stateless.rs rename to web-api/src/personalization/stateless.rs diff --git a/web-api/src/storage.rs b/web-api/src/storage.rs index 3e0ee2a0c..56c424435 100644 --- a/web-api/src/storage.rs +++ b/web-api/src/storage.rs @@ -34,8 +34,7 @@ use xayn_web_api_shared::{postgres as postgres_shared, request::TenantId}; use self::property_filter::{IndexedPropertiesSchema, IndexedPropertiesSchemaUpdate}; use crate::{ app::SetupError, - backoffice::IngestionConfig, - frontoffice::filter::Filter, + ingestion::IngestionConfig, models::{ self, DocumentForIngestion, @@ -51,6 +50,7 @@ use crate::{ SnippetOrDocumentId, UserId, }, + personalization::filter::Filter, tenants, Error, }; diff --git a/web-api/src/storage/elastic/filter.rs b/web-api/src/storage/elastic/filter.rs index e7a9cabbe..bbba7c8c8 100644 --- a/web-api/src/storage/elastic/filter.rs +++ b/web-api/src/storage/elastic/filter.rs @@ -17,8 +17,8 @@ use std::{borrow::Cow, cmp::Ordering, collections::HashMap}; use serde::{Serialize, Serializer}; use crate::{ - frontoffice::filter::{self, Combine, CombineOp, Compare, CompareOp}, models::{DocumentId, DocumentProperty, DocumentPropertyId, SnippetId}, + personalization::filter::{self, Combine, CombineOp, Compare, CompareOp}, storage::Exclusions, }; diff --git a/web-api/src/storage/postgres.rs b/web-api/src/storage/postgres.rs index de3a189b1..efd476b79 100644 --- a/web-api/src/storage/postgres.rs +++ b/web-api/src/storage/postgres.rs @@ -56,7 +56,7 @@ use super::{ TagWeights, }; use crate::{ - backoffice::IngestionConfig, + ingestion::IngestionConfig, models::{ DocumentContent, DocumentDevData,