Merge pull request #1683 from microsoft/staging
Staging into main (small changes)
anargyri authored Mar 29, 2022
2 parents c4435a9 + 6eb39ff commit 6987858
Showing 16 changed files with 295 additions and 172 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/nightly.yml
@@ -184,7 +184,7 @@ jobs:
build-gpu:
runs-on: [self-hosted, Linux, gpu, nightly] # this is a union of labels to select specific self-hosted machine
needs: static-analysis
timeout-minutes: 420
timeout-minutes: 480
strategy:
matrix:
python: [3.7]
72 changes: 35 additions & 37 deletions .github/workflows/sarplus.yml
@@ -6,6 +6,7 @@
# + [scala](https://github.com/actions/starter-workflows/blob/main/ci/scala.yml)
# * [GitHub hosted runner - Ubuntu 20.04 LTS](https://github.com/actions/virtual-environments/blob/main/images/linux/Ubuntu2004-README.md)
# * [Azure Databricks runtime releases](https://docs.microsoft.com/en-us/azure/databricks/release-notes/runtime/releases)
# * [Azure Synapse Analytics runtimes](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-version-support)


name: sarplus test and package
@@ -42,7 +43,7 @@ jobs:

- name: Install dependencies
run: |
python -m pip install -U build pip twine
python -m pip install -U build cibuildwheel pip twine
python -m pip install -U flake8 pytest pytest-cov scikit-learn
- name: Lint with flake8
@@ -57,16 +58,19 @@
cd "${PYTHON_ROOT}"
cp "${SARPLUS_ROOT}/VERSION" ./pysarplus/VERSION
python -m build --sdist
PYTHON_VER='${{ matrix.python-version }}'
MINOR_VER="${PYTHON_VER#*.}"
CIBW_BUILD="cp3${MINOR_VER}-manylinux_x86_64" python -m cibuildwheel --platform linux --output-dir dist
python -m twine check dist/*
# set sarplus_version
# set sarplus_version for the artifact upload step
SARPLUS_VERSION=$(cat "${SARPLUS_ROOT}/VERSION")
echo "sarplus_version=${SARPLUS_VERSION}" >> $GITHUB_ENV
- name: Test
run: |
cd "${PYTHON_ROOT}"
python -m pip install dist/*.tar.gz
python -m pip install dist/*.whl
cd "${SCALA_ROOT}"
export SPARK_VERSION=$(python -m pip show pyspark | grep -i version | cut -d ' ' -f 2)
@@ -82,34 +86,42 @@
cd "${PYTHON_ROOT}"
pytest ./tests
- name: Upload Python package as GitHub artifact
- name: Upload Python wheel as GitHub artifact when merged into main
# Upload the whl file of the specific python version
if: github.ref == 'refs/heads/main'
uses: actions/upload-artifact@v2
with:
name: pysarplus-${{ env.sarplus_version }}-cp${{ matrix.python-version }}-wheel
path: ${{ env.PYTHON_ROOT }}/dist/*.whl

- name: Upload Python source as GitHub artifact when merged into main
# Only one pysarplus source tar file is needed
if: github.ref == 'refs/heads/main' && matrix.python-version == '3.10'
uses: actions/upload-artifact@v2
with:
name: pysarplus-${{ env.sarplus_version }}
name: pysarplus-${{ env.sarplus_version }}-source
path: ${{ env.PYTHON_ROOT }}/dist/*.tar.gz

scala-test:
# Test sarplus with different versions of Databricks runtime, 2 LTSs and 1
# latest.
scala:
# Test sarplus with different versions of Databricks and Synapse runtime
runs-on: ubuntu-latest
strategy:
matrix:
include:
# For Azure Databricks 7.3 LTS
- scala-version: "2.12.10"
spark-version: "3.0.1"
hadoop-version: "2.7.4"
databricks-runtime: "ADB 7.3 LTS"

# For Azure Databricks 9.1 LTS and Azure Synapse Apache Spark 3.1 runtime
- scala-version: "2.12.10"
spark-version: "3.1.2"
hadoop-version: "2.7.4"
databricks-runtime: "ADB 9.1 LTS"

# For Azure Databricks 10.4 LTS
- scala-version: "2.12.14"
spark-version: "3.2.0"
spark-version: "3.2.1"
hadoop-version: "3.3.1"
databricks-runtime: "ADB 10.0"

steps:
- uses: actions/checkout@v2
@@ -121,33 +133,19 @@
export HADOOP_VERSION="${{ matrix.hadoop-version }}"
sbt ++${{ matrix.scala-version }}! test
scala-package:
# Package sarplus and upload as GitHub workflow artifact when merged into
# the main branch.
needs: scala-test
if: github.ref == 'refs/heads/main'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2

- name: Package
# Generate jar files for Spark below 3.2 and above 3.2
if: github.ref == 'refs/heads/main' && matrix.spark-version != '3.0.1'
env:
GPG_KEY: ${{ secrets.SARPLUS_GPG_PRI_KEY_ASC }}
run: |
SARPLUS_VERSION=$(cat "${SARPLUS_ROOT}/VERSION")
# generate artifacts
cd "${SCALA_ROOT}"
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom
export SPARK_VERSION="3.2.0"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
export SPARK_VERSION='${{ matrix.spark-version }}'
export HADOOP_VERSION='${{ matrix.hadoop-version }}'
export SCALA_VERSION='${{ matrix.scala-version }}'
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
@@ -159,14 +157,14 @@
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done
# bundle
jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus_*.jar sarplus_*.pom sarplus_*.asc
jar cvf sarplus-spark-3.2-plus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus-spark*.jar sarplus-spark*.pom sarplus-spark*.asc
jar cvf sarplus-${SARPLUS_VERSION}-bundle_2.12-spark-${SPARK_VERSION}.jar sarplus_*.jar sarplus_*.pom sarplus_*.asc
# set sarplus_version
# set sarplus_version for the artifact upload step
echo "sarplus_version=${SARPLUS_VERSION}" >> $GITHUB_ENV
- name: Upload Scala bundle as GitHub artifact
- name: Upload Scala bundle as GitHub artifact when merged into main
if: github.ref == 'refs/heads/main'
uses: actions/upload-artifact@v2
with:
name: sarplus-bundle_2.12-${{ env.sarplus_version }}
name: sarplus-${{ env.sarplus_version }}-bundle_2.12-spark-${{ matrix.spark-version }}-jar
path: ${{ env.SCALA_ROOT }}/target/scala-2.12/*bundle*.jar
42 changes: 20 additions & 22 deletions contrib/sarplus/DEVELOPMENT.md
@@ -11,11 +11,13 @@ Steps to package and publish (also described in

```bash
# build dependencies
python -m pip install -U build pip twine
python -m pip install -U build cibuildwheel pip twine

cd python
cp ../VERSION ./pysarplus/ # copy version file
python -m build --sdist
MINOR_VERSION=$(python --version | cut -d '.' -f 2)
CIBW_BUILD="cp3${MINOR_VERSION}-manylinux_x86_64" python -m cibuildwheel --platform linux --output-dir dist
python -m twine upload dist/*
```

@@ -25,31 +27,26 @@ Steps to package and publish (also described in

```bash
export SARPLUS_VERSION=$(cat VERSION)
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
GPG_KEY="<gpg-private-key>"
GPG_KEY_ID="<gpg-key-id>"
cd scala

# generate artifacts
cd scala
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom
export SPARK_VERSION="3.1.2"
export HADOOP_VERSION="2.7.4"
export SCALA_VERSION="2.12.10"
sbt ++${SCALA_VERSION}! package packageDoc packageSrc makePom

# generate the artifact (sarplus-*-spark32.jar) for Spark 3.2+
export SPARK_VERSION="3.2.0"
# generate the artifact (sarplus-spark-3-2-plus*.jar) for Spark 3.2+
export SPARK_VERSION="3.2.1"
export HADOOP_VERSION="3.3.1"
export SCALA_VERSION="2.12.14"
sbt ++${SCALA_VERSION}! package
sbt ++${SCALA_VERSION}! packageDoc
sbt ++${SCALA_VERSION}! packageSrc
sbt ++${SCALA_VERSION}! makePom
sbt ++${SCALA_VERSION}! package packageDoc packageSrc makePom

# sign with GPG
cd target/scala-${SCALA_VERSION%.*}
gpg --import <(cat <<< "${GPG_KEY}")
for file in {*.jar,*.pom}; do gpg -ab "${file}"; done
for file in {*.jar,*.pom}; do gpg -ab -u "${GPG_KEY_ID}" "${file}"; done

# bundle
jar cvf sarplus-bundle_2.12-${SARPLUS_VERSION}.jar sarplus_*.jar sarplus_*.pom sarplus_*.asc
@@ -85,7 +82,7 @@ pytest ./tests
To test the Scala formatter

```bash
export SPARK_VERSION=3.2.0
export SPARK_VERSION=3.2.1
export HADOOP_VERSION=3.3.1
export SCALA_VERSION=2.12.14

@@ -97,19 +94,20 @@ sbt ++${SCALA_VERSION}! test
## Notes for Spark 3.x ##

The code now has been modified to support Spark 3.x, and has been
tested under different versions of Databricks Runtime (including 6.4
Extended Support, 7.3 LTS, 9.1 LTS, 10.0 and 10.1) on Azure Databricks
Service. However, there is a breaking change of
tested under Azure Synapse Apache Spark 3.1 runtime and different
versions of Databricks Runtime (including 6.4 Extended Support, 7.3
LTS, 9.1 LTS and 10.4 LTS) on Azure Databricks Service. However,
there is a breaking change of
[org.apache.spark.sql.execution.datasources.OutputWriter](https://github.com/apache/spark/blob/dc0fa1eef74238d745dabfdc86705b59d95b07e1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/OutputWriter.scala#L74)
on **Spark 3.2**, which adds an extra function `path()`, so an
additional package called [Sarplus Spark 3.2
Plus](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus-spark-3-2-plus_2.12)
(with Maven coordinate such as
`com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.6.4`) should be
`com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.6.5`) should be
used if running on Spark 3.2 instead of
[Sarplus](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus_2.12)
(with Maven coordinate like
`com.microsoft.sarplus:sarplus_2.12:0.6.4`).
`com.microsoft.sarplus:sarplus_2.12:0.6.5`).
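
A minimal sketch (not part of the repository) of how a PySpark session can
pick the matching coordinate automatically; the version check below is
illustrative and assumes the `PYSPARK_SUBMIT_ARGS`/`--packages` approach shown
in the project README:

```python
# Hedged sketch: choose the sarplus Maven coordinate based on the installed
# PySpark version (sarplus-spark-3-2-plus is needed on Spark 3.2+ because of
# the OutputWriter path() change described above).
import os
import pyspark

parts = pyspark.__version__.split(".")
major, minor = int(parts[0]), int(parts[1])
if (major, minor) >= (3, 2):
    SARPLUS_MVN_COORDINATE = "com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.6.5"
else:
    SARPLUS_MVN_COORDINATE = "com.microsoft.sarplus:sarplus_2.12:0.6.5"

os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {SARPLUS_MVN_COORDINATE} pyspark-shell"
```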

In addition to `spark.sql.crossJoin.enabled true`, extra
configurations are required when running on Spark 3.x:
60 changes: 39 additions & 21 deletions contrib/sarplus/README.md
@@ -157,7 +157,7 @@ Insert this cell prior to the code above.
```python
import os

SARPLUS_MVN_COORDINATE = "com.microsoft.sarplus:sarplus_2.12:0.6.4"
SARPLUS_MVN_COORDINATE = "com.microsoft.sarplus:sarplus_2.12:0.6.5"
SUBMIT_ARGS = f"--packages {SARPLUS_MVN_COORDINATE} pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

@@ -180,7 +180,7 @@ spark = (
### PySpark Shell

```bash
SARPLUS_MVN_COORDINATE="com.microsoft.sarplus:sarplus_2.12:0.6.4"
SARPLUS_MVN_COORDINATE="com.microsoft.sarplus:sarplus_2.12:0.6.5"

# Install pysarplus
pip install pysarplus
Expand All @@ -201,14 +201,14 @@ pyspark --packages "${SARPLUS_MVN_COORDINATE}" \
1. Create Library
1. Under `Library Source` select `Maven`
1. Enter into `Coordinates`:
* `com.microsoft.sarplus:sarplus_2.12:0.6.4`
* or `com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.6.4` (if
* `com.microsoft.sarplus:sarplus_2.12:0.6.5`
* or `com.microsoft.sarplus:sarplus-spark-3-2-plus_2.12:0.6.5` (if
you're on Spark 3.2+)
1. Hit `Create`
1. Attach to your cluster
1. Create 2nd library
1. Under `Library Source` select `PyPI`
1. Enter `pysarplus==0.6.4`
1. Enter `pysarplus==0.6.5`
1. Hit `Create`

This will install C++, Python and Scala code on your cluster. See
@@ -232,12 +232,23 @@ for details on how to install libraries on Azure Databricks.
These will set the crossJoin property to enable calculation of the
similarity matrix, and set default sources to parquet.

These settings can also be applied by putting the following Python code in a
notebook cell:

```python
spark.conf.set("spark.sql.crossJoin.enabled", "true")
spark.conf.set("spark.sql.sources.default", "parquet")
spark.conf.set("spark.sql.legacy.createHiveTableByDefault", "true")
```


#### Prepare local file system for cache

`pysarplus.SARPlus.recommend_k_items()` needs a local file system path
as its second parameter for storing intermediate files during its
calculation, so you'll also have to **mount** shared storage.
To use C++ based fast prediction in
`pysarplus.SARPlus.recommend_k_items()`, a local cache directory needs
to be specified as the `cache_path` parameter of `pysarplus.SARPlus()`
to store intermediate files during its calculation, so you'll also
have to **mount** shared storage.

For example, you can [create a storage
account](https://ms.portal.azure.com/#create/Microsoft.StorageAccount)
Expand All @@ -259,9 +270,8 @@ dbutils.fs.mount(
where `<storage-account>`, `<container>` and `<access-key>` should be
replaced with the actual values, such as `sarplusstorage`,
`sarpluscache` and the access key of the storage account. Then pass
`"dbfs:/mnt/<container>/cache"` to
`pysarplus.SARPlus.recommend_k_items()` as the value for its 2nd
parameter.
`cache_path="dbfs:/mnt/<container>/cache"` to `pysarplus.SARPlus()`,
where `cache` is the cache's name.
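
For illustration, a hedged sketch of how the mounted cache path is then used;
the column names, similarity type and exact keyword arguments are assumptions
and may differ between `pysarplus` versions:

```python
from pysarplus import SARPlus

# train_df / test_df: Spark DataFrames of user-item interactions (assumed names).
model = SARPlus(
    spark,
    col_user="user_id",          # assumed column names; adjust to your data
    col_item="item_id",
    col_rating="rating",
    similarity_type="jaccard",
    cache_path="dbfs:/mnt/sarpluscache/cache",  # the mount created above
)
model.fit(train_df)
top_k = model.recommend_k_items(test_df, top_k=10)
```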


To disable logging messages:
@@ -276,12 +286,17 @@ logging.getLogger("py4j").setLevel(logging.ERROR)

#### Install libraries

1. Download pysarplus TAR file from
1. Download pysarplus WHL file from
[pysarplus@PyPI](https://pypi.org/project/pysarplus/)
1. Download sarplus JAR file from
1. Download sarplus JAR file
from [sarplus@MavenCentralRepository](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus_2.12)

(or
[sarplus-spark-3-2-plus@MavenCentralRepository](https://search.maven.org/artifact/com.microsoft.sarplus/sarplus-spark-3-2-plus_2.12)
if running on Spark 3.2+)
1. Navigate to your Azure Synapse workspace -> `Manage` -> `Workspace
packages`
1. Upload pysarplus TAR file and sarplus JAR file as workspace
1. Upload pysarplus WHL file and sarplus JAR file as workspace
packages
1. Navigate to your Azure Synapse workspace -> `Manage` -> `Apache
Spark pools`
@@ -291,16 +306,19 @@ logging.getLogger("py4j").setLevel(logging.ERROR)
previous step
1. Apply

See [Manage libraries for Apache Spark in Azure Synapse
pysarplus can also be installed via `requirements.txt`. See [Manage
libraries for Apache Spark in Azure Synapse
Analytics](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries)
for details on how to manage libraries in Azure Synapse.


#### Prepare local file system for cache

`pysarplus.SARPlus.recommend_k_items()` needs a local file system path
as its second parameter for storing intermediate files during its
calculation, so you'll also have to **mount** shared storage.
To use C++ based fast prediction in
`pysarplus.SARPlus.recommend_k_items()`, a local cache directory needs
to be specified as the `cache_path` parameter of `pysarplus.SARPlus()`
to store intermediate files during its calculation, so you'll also
have to **mount** shared storage.

For example, you can run the following code to mount the file system
(container) of the default/primary storage account.
@@ -315,9 +333,9 @@ mssparkutils.fs.mount(
job_id = mssparkutils.env.getJobId()
```

Then pass `f"synfs:/{job_id}/mnt/<container>/cache"` to
`pysarplus.SARPlus.recommend_k_items()` as the value for its 2nd
parameter. **NOTE**: `job_id` should be prepended to the local path.
Then pass `cache_path=f"synfs:/{job_id}/mnt/<container>/cache"` to
`pysarplus.SARPlus()`, where `cache` is the cache's name. **NOTE**:
`job_id` should be prepended to the local path.
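
For example (a sketch under the same assumptions as the Databricks snippet
above; `sarpluscache` is a placeholder container name):

```python
from pysarplus import SARPlus

# job_id comes from mssparkutils.env.getJobId() as shown above.
cache_path = f"synfs:/{job_id}/mnt/sarpluscache/cache"
model = SARPlus(
    spark,
    col_user="user_id",   # assumed column names
    col_item="item_id",
    col_rating="rating",
    cache_path=cache_path,
)
```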

See [How to use file mount/unmount API in Synapse](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/synapse-file-mount-api)
for more details.
2 changes: 1 addition & 1 deletion contrib/sarplus/VERSION
@@ -1 +1 @@
0.6.4
0.6.5
(Diffs for the remaining 11 changed files are not shown.)
