Skip to content

Commit

Permalink
Merge branch 'main' into Issue3549
Browse files Browse the repository at this point in the history
  • Loading branch information
jtruit authored Dec 20, 2024
2 parents e193684 + 5b9d7b3 commit 1093136
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 65 deletions.
24 changes: 24 additions & 0 deletions .github/workflows/feature_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,30 @@ jobs:
- name: Run Pytest
run: uv run pytest --cov-report= --cov=cumulusci

# Runs the unit test suite with every optional extra installed, once per
# OS / Python version combination in the matrix below.
unit_tests_opt_deps:
name: "Unit tests with optional dependencies: ${{ matrix.os }}-${{ matrix.python-version }}"
runs-on: ${{ matrix.os }}
strategy:
# Keep the remaining matrix entries running when one combination fails.
fail-fast: false
matrix:
os: [macos-latest, SFDO-Tooling-Ubuntu, SFDO-Tooling-Windows]
python-version: ["3.11", "3.12", "3.13"]
steps:
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "${{ matrix.python-version }}"
- name: Set up uv
uses: SFDO-Tooling/setup-uv@main
with:
version: "0.5.0"
enable-cache: true
- name: Install dependencies
# --all-extras pulls in every optional dependency group; -p selects the
# interpreter for this matrix entry.
run: uv sync --all-extras -p ${{ matrix.python-version }}
- name: Run Pytest
# Coverage is collected for the cumulusci package; the empty --cov-report=
# value presumably suppresses the report output here (confirm against pytest-cov).
run: uv run pytest --cov-report= --cov=cumulusci

robot_api:
name: "Robot: No browser"
runs-on: SFDO-Tooling-Ubuntu
Expand Down
2 changes: 1 addition & 1 deletion cumulusci/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "4.0.1"
__version__ = "4.0.1.dev0"
33 changes: 27 additions & 6 deletions cumulusci/tasks/bulkdata/select_utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
import logging
import random
import re
import typing as T
from enum import Enum

import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from pydantic import Field, root_validator, validator
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.preprocessing import StandardScaler

from cumulusci.core.enums import StrEnum
from cumulusci.tasks.bulkdata.extract_dataset_utils.hardcoded_default_declarations import (
DEFAULT_DECLARATIONS,
)
from cumulusci.tasks.bulkdata.utils import CaseInsensitiveDict
from cumulusci.utils import get_cci_upgrade_command
from cumulusci.utils.yaml.model_parser import CCIDictModel

logger = logging.getLogger(__name__)

# numpy, pandas, annoy and scikit-learn are optional. Import them defensively
# and record the outcome in OPTIONAL_DEPENDENCIES_AVAILABLE so that code in
# this module can check the flag and fall back to a slower path instead of
# failing at import time.
try:
    import numpy as np
    import pandas as pd
    from annoy import AnnoyIndex
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.preprocessing import StandardScaler
except ImportError:
    # Emitted once, when this module is first imported, so users learn why
    # large 'select' operations are slow and how to enable the fast path.
    logger.warning(
        "Optional dependencies are missing. "
        "Handling high volumes of records for the 'select' functionality will be significantly slower, "
        "as optimizations for this feature are currently disabled. "
        f"To enable optimized performance, install all required dependencies using: {get_cci_upgrade_command()}[select]\n"
    )
    OPTIONAL_DEPENDENCIES_AVAILABLE = False
else:
    # Every optional import succeeded.
    OPTIONAL_DEPENDENCIES_AVAILABLE = True


class SelectStrategy(StrEnum):
"""Enum defining the different selection strategies requested."""
Expand Down Expand Up @@ -308,7 +323,7 @@ def similarity_post_process(
select_records = []
insert_records = []

if complexity_constant < 1000:
if complexity_constant < 1000 or not OPTIONAL_DEPENDENCIES_AVAILABLE:
select_records, insert_records = levenshtein_post_process(
load_records, query_records, fields, weights, threshold
)
Expand All @@ -328,6 +343,12 @@ def annoy_post_process(
threshold: T.Union[float, None],
) -> T.Tuple[T.List[dict], list]:
"""Processes the query results for the similarity selection strategy using Annoy algorithm for large number of records"""
# Add warning when threshold is 0
if threshold is not None and threshold == 0:
logger.warning(
"Warning: A threshold of 0 may miss exact matches in high volumes. Use a small value like 0.1 for better accuracy."
)

selected_records = []
insertion_candidates = []

Expand Down
50 changes: 49 additions & 1 deletion cumulusci/tasks/bulkdata/tests/test_select_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest

from cumulusci.tasks.bulkdata.select_utils import (
OPTIONAL_DEPENDENCIES_AVAILABLE,
SelectOperationExecutor,
SelectStrategy,
add_limit_offset_to_user_filter,
Expand All @@ -15,6 +15,14 @@
vectorize_records,
)

# Check for pandas availability.
# PANDAS_AVAILABLE records whether the import succeeded; the skipif markers
# in this module read it to skip DataFrame-based tests when pandas is absent.
try:
import pandas as pd

PANDAS_AVAILABLE = True
except ImportError:
PANDAS_AVAILABLE = False


# Test Cases for standard_generate_query
def test_standard_generate_query_with_default_record_declaration():
Expand Down Expand Up @@ -511,6 +519,10 @@ def test_calculate_levenshtein_distance_weights_length_doesnt_match():
assert "Records must be same size as fields (weights)." in str(e.value)


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_numeric_columns():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", " 5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", ""], "B": ["4.5", "5.5", "6.5"]})
Expand All @@ -526,6 +538,10 @@ def test_all_numeric_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_numeric_columns__one_non_numeric():
df_db = pd.DataFrame({"A": ["1", "2", "3"], "B": ["4.5", "5.5", "6.5"]})
df_query = pd.DataFrame({"A": ["4", "5", "6"], "B": ["abcd", "5.5", "6.5"]})
Expand All @@ -541,6 +557,10 @@ def test_numeric_columns__one_non_numeric():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_boolean_columns():
df_db = pd.DataFrame(
{"A": ["true", "false", "true"], "B": ["false", "true", "false"]}
Expand All @@ -560,6 +580,10 @@ def test_all_boolean_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_all_categorical_columns():
df_db = pd.DataFrame(
{"A": ["apple", "banana", "cherry"], "B": ["dog", "cat", "mouse"]}
Expand All @@ -579,6 +603,10 @@ def test_all_categorical_columns():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_mixed_types():
df_db = pd.DataFrame(
{
Expand Down Expand Up @@ -606,6 +634,10 @@ def test_mixed_types():
assert determine_field_types(df_db, df_query, weights) == expected_output


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_vectorize_records_mixed_numerical_boolean_categorical():
# Test data with mixed types: numerical and categorical only
db_records = [["1.0", "true", "apple"], ["2.0", "false", "banana"]]
Expand Down Expand Up @@ -633,6 +665,10 @@ def test_vectorize_records_mixed_numerical_boolean_categorical():
), "Query vectors column count mismatch"


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand All @@ -659,6 +695,10 @@ def test_annoy_post_process():
assert not insert_records


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records():
# Test data
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down Expand Up @@ -714,6 +754,10 @@ def test_annoy_post_process__no_query_records():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_annoy_post_process__insert_records_with_polymorphic_fields():
# Test data
load_records = [
Expand Down Expand Up @@ -749,6 +793,10 @@ def test_annoy_post_process__insert_records_with_polymorphic_fields():
] # The first insert record should match the second load record


@pytest.mark.skipif(
not PANDAS_AVAILABLE or not OPTIONAL_DEPENDENCIES_AVAILABLE,
reason="requires optional dependencies for annoy",
)
def test_single_record_match_annoy_post_process():
# Mock data where only the first query record matches the first load record
load_records = [["Alice", "Engineer"], ["Bob", "Doctor"]]
Expand Down
3 changes: 3 additions & 0 deletions docs/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,9 @@ This parameter is **optional**; if not specified, no threshold will be applied a

This feature is particularly useful during version upgrades, where records that closely match can be selected, while those that do not match sufficiently can be inserted into the target org.

**Important Note:**
For high volumes of records, an approximation algorithm is applied to improve performance. In such cases, setting a threshold of `0` may not guarantee the selection of exact matches, as the algorithm can assign a small non-zero similarity score to exact matches. To ensure accurate selection, it is recommended to set the threshold to a small value slightly greater than `0`, such as `0.1`. This ensures both precision and efficiency in the selection process.

---

#### Example
Expand Down
4 changes: 4 additions & 0 deletions docs/env-var-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,7 @@ org, e.g. a Dev Hub. Set with SFDX_CLIENT_ID.
## `SFDX_ORG_CREATE_ARGS`

Extra arguments passed to `sf org create scratch`.

Set this variable to the extra arguments exactly as you would type them on the command line. For example, to set the release to "preview", the value of the environment variable would be `--release=preview`.

To specify multiple options, include them together in the same value, separated by spaces, for example: `--edition=developer --release=preview`.
Loading

0 comments on commit 1093136

Please sign in to comment.