Patch pyarrow.open_stream to support pyarrow>0.17 (#52)

* fix: patch pyarrow.open_stream to support pyarrow>0.17
* test: import utils.logging to improve coverage

---------

Co-authored-by: Thomas Petit-Jean <30775613+Thomzoy@users.noreply.github.com>
aphp · Mar 21, 2024 · e7320c7 · e7320c7
1 parent d85ef2a
commit e7320c7
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 5 deletions.
diff --git a/changelog.md b/changelog.md
@@ -1,6 +1,9 @@
 # Changelog
 
 ## Unreleased
+### Changed
+- Support for pyarrow > 0.17.0
+
 ### Added
 - biology module refacto
 - load_koalas() not by default in __init__.py but called in the improve_performance function
@@ -12,6 +15,7 @@
 - Caching in spark instead of koalas to improve speed
 
 ## v0.1.6 (2023-09-27)
+
 ### Added
 - Module ``event_sequences`` to visualize individual sequences of events.
 - Module ``age_pyramid`` to quickly visualize the age and gender distributions in a cohort.

diff --git a/docs/generate_reference.py b/docs/generate_reference.py
@@ -8,7 +8,7 @@
 
 for path in sorted(Path("eds_scikit").rglob("*.py")):
     print(path)
-    if ".ipynb_checkpoints" in path.parts:
+    if ".ipynb_checkpoints" in path.parts or "package-override" in path.parts:
         continue
     module_path = path.relative_to(".").with_suffix("")
     doc_path = path.relative_to("eds_scikit").with_suffix(".md")

diff --git a/eds_scikit/__init__.py b/eds_scikit/__init__.py
@@ -9,10 +9,31 @@
     action="ignore", category=FutureWarning
 )  # Remove pyarrow DeprecatedWarning
 
+import importlib
+import os
+import pathlib
+import sys
+import time
+from packaging import version
+from typing import List, Tuple
+from pathlib import Path
+
 import pandas as pd
+import pyarrow
+import pyarrow.ipc
+import pyspark
 from loguru import logger
-from eds_scikit.io import koalas_options, improve_performances
-import eds_scikit.biology
+from pyspark import SparkContext
+from pyspark.sql import SparkSession
+
+import eds_scikit.biology  # noqa: F401 --> To register functions
+
+# PySpark 2 calls pyarrow.open_stream, which recent pyarrow releases only expose
+# as pyarrow.ipc.open_stream: restore the old alias in the current process.
+pyarrow.open_stream = pyarrow.ipc.open_stream
+
+# Put the "package-override" directory first on the import path, then export the
+# whole import path through PYTHONPATH so that it is shared with the executors.
+sys.path.insert(
+    0, (pathlib.Path(__file__).parent / "package-override").absolute().as_posix()
+)
+# os.pathsep (":" on POSIX, ";" on Windows) instead of a hard-coded ":" keeps the
+# exported value valid on every platform; the result is unchanged on POSIX.
+os.environ["PYTHONPATH"] = os.pathsep.join(sys.path)
 
 # Remove SettingWithCopyWarning
 pd.options.mode.chained_assignment = None

diff --git a/eds_scikit/package-override/pyarrow/__init__.py b/eds_scikit/package-override/pyarrow/__init__.py
@@ -0,0 +1,38 @@
+"""
+PySpark 2 needs pyarrow.open_stream, which was deprecated in 0.17.0 in favor of
+pyarrow.ipc.open_stream. Here is the explanation of how we monkey-patch pyarrow
+to add back pyarrow.open_stream for versions > 0.17 and how we make this work with
+pyspark distributed computing :
+1. We add this fake eds_scikit/package-override/pyarrow package to python lookup list
+   (the PYTHONPATH env var) in eds_scikit/__init__.py : this env variable will be shared
+   with the executors
+2. When an executor starts and imports packages, it looks for the packages by inspecting
+   the paths in PYTHONPATH. It finds our fake pyarrow package first and executes the
+   current eds_scikit/package-override/pyarrow/__init__.py file
+3. In this file, we remove the fake pyarrow package path from the lookup list, unload
+   the current module from python modules cache (sys.modules) and re-import pyarrow
+   => the executor's python will this time load the true pyarrow and store it in
+   sys.modules. Subsequent "import pyarrow" calls will return the sys.modules["pyarrow"]
+   value, which is the true pyarrow module.
+4. We are not finished: we add back the deprecated "open_stream" function that was
+   removed in pyarrow 0.17.0 (the reason for all this hacking) by setting it
+   on the true pyarrow module
+5. We still export the pyarrow module content (*) such that the first import, which
+   is the only one that resolves to this very module, still gets what it asked for:
+   the pyarrow module's content.
+"""
+
+import sys
+
+# Remove the first "package-override" entry from the import path so that the
+# next "import pyarrow" no longer resolves to this shim. The lookup result is
+# checked before removal: sys.path.remove(None) would raise an unrelated
+# ValueError when no entry matches.
+_override_path = next((p for p in sys.path if "package-override" in p), None)
+if _override_path is not None:
+    sys.path.remove(_override_path)
+
+# Drop this shim from the module cache (if registered under the "pyarrow" name)
+# so that the import below is resolved again instead of being served from cache.
+sys.modules.pop("pyarrow", None)
+import pyarrow  # noqa: E402, F401
+
+# Add back the top-level open_stream alias on the re-imported module; if the
+# pyarrow.ipc submodule cannot be imported, the module is left untouched.
+try:
+    import pyarrow.ipc
+
+    pyarrow.open_stream = pyarrow.ipc.open_stream
+except ImportError:
+    pass
+
+# Re-export the module content so that the very first import, which resolved to
+# this file, still exposes the names of the re-imported pyarrow module.
+from pyarrow import *  # noqa: F401, F403, E402
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,7 +42,7 @@ dependencies = [
     "regex",
     "pypandoc==1.7.5",
     "pyspark==2.4.3",
-    "pyarrow==0.17.0", #"pyarrow>=0.10, <0.17.0",
+    "pyarrow>=0.10.0",
     "pretty-html-table>=0.9.15, <0.10.0",
     "catalogue",
     "schemdraw>=0.15.0, <1.0.0",

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,7 +7,8 @@
 from databricks import koalas as ks
 from loguru import logger
 
-from eds_scikit.io.improve_performance import improve_performances
+import eds_scikit.utils.logging  # noqa: F401
+from eds_scikit import improve_performances
 
 from . import test_registry  # noqa: F401 --> To register functions