Commit
[CI] Replace load_boston with randomly generated regression data (#428)

* Update compatibility tester

* Set up hypothesis in XGBoost integration test

* Add isort config

* Update test_skl_importer

* Update test_gtil

* Update test_lightgbm_integration

* [CI] Install hypothesis

* Remove unnecessary edit

* [CI] Show run time for all unit tests

* Relax hypothesis assertion

* Exclude invalid regression cases

* Relax test_lightgbm_regression

* [CI] Use local temp dir in Windows

* Relax
hcho3 authored Dec 19, 2022
1 parent cce130c commit a6f4802
Showing 17 changed files with 1,070 additions and 593 deletions.
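Background for the change: scikit-learn deprecated load_boston in 1.0 and removed it in 1.2, so tests that trained on the Boston dataset need a synthetic substitute. A minimal sketch of the replacement pattern (the sample counts and seed here are illustrative, not taken from the diff):

import numpy as np
from sklearn.datasets import make_regression

# Before (removed upstream): X, y = load_boston(return_X_y=True)
# After: draw a synthetic regression problem of comparable shape.
X, y = make_regression(n_samples=150, n_features=100, random_state=0)
X, y = X.astype(np.float32), y.astype(np.float32)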
3 changes: 3 additions & 0 deletions .isort.cfg
@@ -0,0 +1,3 @@
[settings]
profile=black
known_first_party=treelite,treelite_runtime
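To sanity-check the new config, isort can also be driven from Python. This snippet is illustrative and not part of the commit; isort.code and its config kwargs are the public isort 5 API:

import isort

# profile="black" and known_first_party mirror the new .isort.cfg: isort
# groups stdlib first, then third-party (numpy), then first-party (treelite).
messy = "import treelite\nimport numpy as np\nimport os\n"
print(isort.code(messy, profile="black",
                 known_first_party=["treelite", "treelite_runtime"]))
# CLI equivalent from the repo root (picks up .isort.cfg): python -m isort .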
1 change: 1 addition & 0 deletions ops/conda_env/dev.yml
@@ -8,6 +8,7 @@ dependencies:
 - pandas
 - pytest
 - pytest-cov
+- hypothesis
 - scikit-learn
 - coverage
 - codecov
2 changes: 1 addition & 1 deletion ops/cpp-python-coverage.sh
@@ -22,7 +22,7 @@ cd ../..

 echo "##[section]Running Python integration tests..."
 export PYTHONPATH='./python:./runtime/python'
-python -m pytest --cov=treelite --cov=treelite_runtime -v -rxXs --fulltrace tests/python tests/cython
+python -m pytest --cov=treelite --cov=treelite_runtime -v -rxXs --fulltrace --durations=0 tests/python tests/cython

 echo "##[section]Collecting coverage data..."
 lcov --directory . --capture --output-file coverage.info
2 changes: 1 addition & 1 deletion ops/macos-python-coverage.sh
@@ -17,7 +17,7 @@ cd ..

 ./build/treelite_cpp_test
 PYTHONPATH=./python:./runtime/python python -m pytest --cov=treelite --cov=treelite_runtime -v -rxXs \
-  --fulltrace tests/python
+  --fulltrace --durations=0 tests/python
 lcov --directory . --capture --output-file coverage.info
 lcov --remove coverage.info '*dmlccore*' --output-file coverage.info
 lcov --remove coverage.info '*fmtlib*' --output-file coverage.info
2 changes: 1 addition & 1 deletion ops/test-linux-python-wheel.sh
@@ -6,7 +6,7 @@ echo "##[section]Installing Treelite into Python environment..."
 pip install python/dist/*.whl runtime/python/dist/*.whl

 echo "##[section]Running Python tests..."
-python -m pytest -v -rxXs --fulltrace tests/python/test_basic.py
+python -m pytest -v -rxXs --fulltrace --durations=0 tests/python/test_basic.py

 echo "##[section]Uploading Python wheels..."
 python -m awscli s3 cp python/dist/*.whl s3://treelite-wheels/ --acl public-read || true
2 changes: 1 addition & 1 deletion ops/test-macos-python-wheel.sh
@@ -6,4 +6,4 @@ echo "##[section]Installing Treelite into Python environment..."
 pip install wheelhouse/*.whl

 echo "##[section]Running Python tests..."
-python -m pytest -v -rxXs --fulltrace tests/python/test_basic.py
+python -m pytest -v -rxXs --fulltrace --durations=0 tests/python/test_basic.py
2 changes: 1 addition & 1 deletion ops/test-sdist.sh
@@ -8,7 +8,7 @@ make pippack
 echo "##[section]Testing the source distribution..."
 python -m pip install -v treelite-*.tar.gz
 python -m pip install -v treelite_runtime-*.tar.gz
-python -m pytest -v -rxXs --fulltrace tests/python/test_basic.py
+python -m pytest -v -rxXs --fulltrace --durations=0 tests/python/test_basic.py

 # Deploy source distribution to S3
 for file in ./treelite-*.tar.gz ./treelite_runtime-*.tar.gz
2 changes: 1 addition & 1 deletion ops/test-win-python-wheel.bat
@@ -18,7 +18,7 @@ for /R %%i in (runtime\python\dist\*.whl) DO (

 echo ##[section]Running Python tests...
 mkdir temp
-python -m pytest --basetemp="%WORKING_DIR%\temp" -v -rxXs --fulltrace tests\python\test_basic.py
+python -m pytest --basetemp="%WORKING_DIR%\temp" -v -rxXs --fulltrace --durations=0 tests\python\test_basic.py
 if %errorlevel% neq 0 exit /b %errorlevel%

 echo ##[section]Uploading Python wheels...
3 changes: 2 additions & 1 deletion ops/win-python-coverage.bat
@@ -14,7 +14,8 @@ mkdir temp
 call micromamba activate dev
 if %errorlevel% neq 0 exit /b %errorlevel%
 set "PYTHONPATH=./python;./runtime/python"
-python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=treelite --cov=treelite_runtime --cov-report xml -v -rxXs --fulltrace tests\python
+set "PYTEST_TMPDIR=%WORKING_DIR%\temp"
+python -m pytest --basetemp="%WORKING_DIR%\temp" --cov=treelite --cov=treelite_runtime --cov-report xml -v -rxXs --fulltrace --durations=0 tests\python
 if %errorlevel% neq 0 exit /b %errorlevel%

 echo ##[section]Submitting code coverage data to CodeCov...
3 changes: 2 additions & 1 deletion src/annotator.cc
@@ -228,7 +228,8 @@ AnnotateImpl(
   // change layout of counts
   std::vector<std::vector<uint64_t>>& counts = *out_counts;
   for (size_t i = 0; i < ntree; ++i) {
-    counts.emplace_back(&new_counts[count_row_ptr[i]], &new_counts[count_row_ptr[i + 1]]);
+    counts.emplace_back(new_counts.begin() + count_row_ptr[i],
+                        new_counts.begin() + count_row_ptr[i + 1]);
   }
 }

13 changes: 8 additions & 5 deletions tests/cython/compatibility_tester.py
@@ -1,9 +1,12 @@
-import pickle
 import argparse
 import os
+import pickle

+import lightgbm as lgb
 import numpy as np
-import treelite
 from sklearn.datasets import load_iris
 from sklearn.ensemble import RandomForestClassifier

+import treelite
+

 def _fetch_data():
@@ -12,7 +15,7 @@ def _fetch_data():


 def _train_model(X, y):
-    clf = RandomForestClassifier(max_depth=3, random_state=0, n_estimators=3)
+    clf = lgb.LGBMClassifier(max_depth=3, random_state=0, n_estimators=3)
     clf.fit(X, y)
     return clf

@@ -22,7 +25,7 @@ def save(args):
     clf = _train_model(X, y)
     with open(args.model_pickle_path, "wb") as f:
         pickle.dump(clf, f)
-    tl_model = treelite.sklearn.import_model(clf)
+    tl_model = treelite.Model.from_lightgbm(clf.booster_)
     tl_model.serialize(args.checkpoint_path)

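The loading side of this tester is not visible in this view; here is a hedged sketch of what the counterpart would look like after this change (treelite.Model.deserialize is the real checkpoint-loading API, but the function body and argument names are illustrative, mirroring save() above):

def load(args):
    # Reload the pickled LightGBM classifier and the Treelite checkpoint
    # written by save(), so their predictions can be compared downstream.
    with open(args.model_pickle_path, "rb") as f:
        clf = pickle.load(f)
    tl_model = treelite.Model.deserialize(args.checkpoint_path)
    return clf, tl_model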
98 changes: 98 additions & 0 deletions tests/python/hypothesis_util.py
@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-
"""Utility functions for hypothesis-based testing"""

from sys import platform as _platform

import numpy as np
from hypothesis.strategies import composite, integers, just, none
from sklearn.datasets import make_regression


@composite
def standard_regression_datasets(
    draw,
    n_samples=integers(min_value=100, max_value=200),
    n_features=integers(min_value=100, max_value=200),
    *,
    n_informative=None,
    n_targets=just(1),
    bias=just(0.0),
    effective_rank=none(),
    tail_strength=just(0.5),
    noise=just(0.0),
    shuffle=just(True),
    random_state=None,
):
    """
    Returns a strategy to generate regression problem input datasets.

    Note:
    This function uses the sklearn.datasets.make_regression function to
    generate the regression problem from the provided search strategies.

    Credit: Carl Simon Adorf (@csadorf)
    https://github.com/rapidsai/cuml/blob/447bded/python/cuml/testing/strategies.py

    Parameters
    ----------
    n_samples: SearchStrategy[int]
        Returned arrays will have number of rows drawn from these values.
    n_features: SearchStrategy[int]
        Returned arrays will have number of columns drawn from these values.
    n_informative: SearchStrategy[int], default=None
        A search strategy for the number of informative features. If None,
        will use 10% of the actual number of features, but not less than 1
        unless the number of features is zero.
    n_targets: SearchStrategy[int], default=just(1)
        A search strategy for the number of targets, i.e. the number of
        columns of the returned y output array.
    bias: SearchStrategy[float], default=just(0.0)
        A search strategy for the bias term.
    effective_rank: SearchStrategy[int], default=none()
        If not None, a search strategy for the effective rank of the input
        data for the regression problem. See sklearn.datasets.make_regression()
        for a detailed explanation of this parameter.
    tail_strength: SearchStrategy[float], default=just(0.5)
        See sklearn.datasets.make_regression() for a detailed explanation of
        this parameter.
    noise: SearchStrategy[float], default=just(0.0)
        A search strategy for the standard deviation of the gaussian noise.
    shuffle: SearchStrategy[bool], default=just(True)
        A boolean search strategy to determine whether samples and features
        are shuffled.
    random_state: int, RandomState instance or None, default=None
        Pass a random state or integer to determine the random number
        generation for data set generation.

    Returns
    -------
    (X, y): SearchStrategy[array], SearchStrategy[array]
        A tuple of search strategies for arrays subject to the constraints of
        the provided parameters.
    """
    n_features_ = draw(n_features)
    if n_informative is None:
        n_informative = just(max(min(n_features_, 1), int(0.1 * n_features_)))
    X, y = make_regression(
        n_samples=draw(n_samples),
        n_features=n_features_,
        n_informative=draw(n_informative),
        n_targets=draw(n_targets),
        bias=draw(bias),
        effective_rank=draw(effective_rank),
        tail_strength=draw(tail_strength),
        noise=draw(noise),
        shuffle=draw(shuffle),
        random_state=random_state,
    )
    return X.astype(np.float32), y.astype(np.float32)


def standard_settings():
    """Default hypothesis settings; use a smaller max_examples on Windows."""
    kwargs = {
        "deadline": None,
        "max_examples": 20,
        "print_blob": True,
    }
    if _platform == "win32":
        kwargs["max_examples"] = 3
    return kwargs
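To show how the two helpers compose, here is a hedged sketch of a consuming test (the test name and assertions are illustrative; the commit's integration tests draw datasets the same way):

import numpy as np
from hypothesis import given, settings

from hypothesis_util import standard_regression_datasets, standard_settings


@given(dataset=standard_regression_datasets())
@settings(**standard_settings())
def test_dataset_shape(dataset):
    # Each example is a freshly drawn (X, y) pair within the default
    # 100-200 sample/feature bounds declared by the strategy above.
    X, y = dataset
    assert X.shape[0] == y.shape[0]
    assert X.dtype == np.float32 and y.dtype == np.float32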
(Diff listing truncated; the remaining changed files are not shown in this view.)
