diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8083adf..eff1152 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       max-parallel: 5
       matrix:
-        python-version: [3.7, 3.8, 3.9]
+        python-version: ["3.7", "3.8", "3.9", "pypy-3.7-nightly"]
 
     steps:
 
@@ -22,12 +22,15 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install dependencies
+    - name: Install HPy (only for CPython)
+      if: ${{ !startsWith(matrix.python-version, 'pypy') }}
       run: |
-        git clone -b master --single-branch https://github.com/hpyproject/hpy
+        git clone -b release/0.0.3 --single-branch https://github.com/hpyproject/hpy
         cd hpy
-        git checkout 7b45ce522
         pip install .
+
+    - name: Install dependencies
+      run: |
         pip install numpy cython pytest transonic pythran
 
     - name: Checkout
@@ -39,13 +42,17 @@ jobs:
       run: |
         python setup.py develop
         python setup.py --hpy-abi=universal develop
+        rm -f piconumpy/_piconumpy_hpy.py
 
     - name: Run tests
       run: |
-        pytest -s
+        pytest -v
 
     - name: Run bench
       run: |
         cd bench
+        make tmp_result_julia.txt
         make bench_hpy
         make
+        # let's rerun bench_hpy to get these results also at the end
+        make bench_hpy
diff --git a/README.md b/README.md
index a0bad3d..6b2a908 100644
--- a/README.md
+++ b/README.md
@@ -104,19 +104,13 @@ pypy -m pip install pip -U
 pypy -m pip install numpy cython pytest transonic pythran
 ```
 
-We need to install the correct version of HPy for the version of PyPy we are using:
+One can check which HPy version is vendored with PyPy:
 
 ```bash
 pypy -c "import hpy.universal as u; print(u.get_version())"
 ```
 
-gives `('0.0.2rc2.dev12+gc9660c2', 'c9660c2')`.
-
-```bash
-cd ~/Dev/hpy
-# update to the correct commit
-pypy setup.py develop
-```
+gives `('0.0.3', '2196f14')`.
 
 Now we can build-install PicoNumpy:
 
@@ -136,36 +130,36 @@ make
 
 ## Few results
 
-As of today (6 July 2021), HPy is not yet ready for high performance, but at
-least (with HPy 0.0.2) it runs !
+As of today (12 October 2021), HPy is not yet ready for high performance, but at
+least (with HPy 0.0.3) it runs!
 
 ### At home (Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz)
 
 - With CPython
 
 ```
-Julia                      :     1 * norm = 0.00196 s
-PicoNumpy (CPython C-API)  :  9.42 * norm
-PicoNumpy (HPy CPy ABI)    :  9.95 * norm
-PicoNumpy (HPy Universal)  :  10.4 * norm
-Transonic-Pythran          : 0.497 * norm
-Numpy                      :  27.5 * norm
-PicoNumpy (purepy)         :  37.3 * norm
-PicoNumpy (purepy_array)   :  37.7 * norm
-PicoNumpy (Cython)         :  28.9 * norm
+Julia                      :     1 * norm = 0.0171 s
+PicoNumpy (CPython C-API)  :  11.1 * norm
+PicoNumpy (HPy CPy ABI)    :  11.6 * norm
+PicoNumpy (HPy Universal)  :  12.1 * norm
+Transonic-Pythran          : 0.537 * norm
+Numpy                      :  33.8 * norm
+PicoNumpy (purepy)         :  43.7 * norm
+PicoNumpy (purepy_array)   :  44.8 * norm
+PicoNumpy (Cython)         :  33.9 * norm
 ```
 
 - With PyPy3
 
 ```
-Julia                      :     1 * norm = 0.00196 s
-PicoNumpy (CPython C-API)  :  34.1 * norm
-PicoNumpy (HPy Universal)  :  12.8 * norm
-Transonic-Pythran          : 0.539 * norm
-Numpy                      :   232 * norm
-PicoNumpy (purepy)         :  4.39 * norm
-PicoNumpy (purepy_array)   :  6.33 * norm
-PicoNumpy (Cython)         :   274 * norm
+Julia                      :     1 * norm = 0.0171 s
+PicoNumpy (CPython C-API)  :  39.2 * norm
+PicoNumpy (HPy Universal)  :  13.1 * norm
+Transonic-Pythran          : 0.562 * norm
+Numpy                      :   286 * norm
+PicoNumpy (purepy)         :  5.59 * norm
+PicoNumpy (purepy_array)   :  7.41 * norm
+PicoNumpy (Cython)         :   282 * norm
 ```
 
 #### Simpler benchmarks (bench/bench_cpy_vs_hpy.py)
@@ -173,14 +167,19 @@ PicoNumpy (Cython)         :   274 * norm
 - With CPython
 
 ```
-CPython C-API:   1.92 seconds
-HPy [Universal]: 2.08 seconds
-HPy [CPy ABI]:   2.02 seconds
+{'cache_tag': 'cpython-39',
+ 'version': sys.version_info(major=3, minor=9, micro=6, releaselevel='final', serial=0)}
+CPython C-API:   0.193 seconds (11.2 * Julia)
+HPy [Universal]: 0.208 seconds (12.1 * Julia)
+HPy [CPy ABI]:   0.201 seconds (11.7 * Julia)
 ```
 
 - With PyPy3
 
 ```
-CPython C-API:   5.75 seconds
-HPy [Universal]: 2.11 seconds
+{'cache_tag': 'pypy37',
+ 'version': sys.pypy_version_info(major=7, minor=3, micro=6, releaselevel='final', serial=0)}
+CPython C-API:   0.592 seconds (34.6 * Julia)
+HPy [Universal]: 0.207 seconds (12.1 * Julia)
+Python list:     0.093 seconds ( 5.4 * Julia)
 ```
diff --git a/bench/bench.jl b/bench/bench.jl
index 00cedff..bd98571 100644
--- a/bench/bench.jl
+++ b/bench/bench.jl
@@ -65,7 +65,7 @@ function bench(n_sleds, n_time)
 end
 
 
-n_sleds = 10
+n_sleds = 100
 n_time = 200
 
 nb_runs = 200
diff --git a/bench/bench_cpy_vs_hpy.py b/bench/bench_cpy_vs_hpy.py
index e54ad54..38f1ba1 100644
--- a/bench/bench_cpy_vs_hpy.py
+++ b/bench/bench_cpy_vs_hpy.py
@@ -1,8 +1,9 @@
 import sys
-import time
+from time import perf_counter
 import random
 from math import pi, cos, sin
 from pathlib import Path
+from pprint import pprint
 
 here = Path(__file__).absolute().parent
 
@@ -75,14 +76,18 @@ def bench(mod, n_sleds, n_time):
     u_init = mod.zeros(n_sleds)
     for i in range(n_sleds):
         u_init[i] += 3.5
-    start = time.time()
-    solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time)
-    end = time.time()
-    return end - start
+    times = []
+    for _ in range(20):
+        start = perf_counter()
+        solver(mod, board, x_init, y_init, u_init, v_init, 0.01, n_time)
+        times.append(perf_counter() - start)
+
+    times.sort()
+    return times[len(times) // 2]
 
 
 N_SLEDS = 100
-N_TIME = 2000
+N_TIME = 200
 
 
 def import_piconumpy_hpy_universal():
@@ -97,18 +102,47 @@ def main():
 
     import piconumpy._piconumpy_cpython_capi as pnp_capi
 
-    t = bench(pnp_capi, N_SLEDS, N_TIME)
-    print(f"CPython C-API:   {t:.2f} seconds")
+    pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
+
+    tmp_result_julia = Path("tmp_result_julia.txt")
+    if tmp_result_julia.exists():
+        with open(tmp_result_julia) as file:
+            norm = float(file.read())
+        end = ""
+        print(f"Julia:           {norm:.3f} seconds")
+    else:
+        norm = False
+        end = "\n"
+
+    t_capi = bench(pnp_capi, N_SLEDS, N_TIME)
+    print(f"CPython C-API:   {t_capi:.3f} seconds", end=end)
+    if norm:
+        print(f" ({t_capi/norm:4.1f} * Julia)")
 
     pnp_hpy_universal = import_piconumpy_hpy_universal()
-    t = bench(pnp_hpy_universal, N_SLEDS, N_TIME)
-    print(f"HPy [Universal]: {t:.2f} seconds")
+    t_hpy_univ = bench(pnp_hpy_universal, N_SLEDS, N_TIME)
+    print(f"HPy [Universal]: {t_hpy_univ:.3f} seconds", end=end)
+
+    if norm:
+        print(f" ({t_hpy_univ/norm:4.1f} * Julia)")
 
     if not IS_PYPY:
         import piconumpy._piconumpy_hpy as pnp_hpy
 
-        t = bench(pnp_hpy, N_SLEDS, N_TIME)
-        print(f"HPy [CPy ABI]:   {t:.2f} seconds")
+        t_hpy_cpy_abi = bench(pnp_hpy, N_SLEDS, N_TIME)
+        print(f"HPy [CPy ABI]:   {t_hpy_cpy_abi:.3f} seconds", end=end)
+
+        if norm:
+            print(f" ({t_hpy_cpy_abi/norm:4.1f} * Julia)")
+
+    if IS_PYPY:
+        import piconumpy.purepy as pnp_with_list
+
+        t_with_list = bench(pnp_with_list, N_SLEDS, N_TIME)
+        print(f"Python list:     {t_with_list:.3f} seconds", end=end)
+
+        if norm:
+            print(f" ({t_with_list/norm:4.1f} * Julia)")
 
 
 if __name__ == "__main__":
diff --git a/bench/make_bench_piconumpy.py b/bench/make_bench_piconumpy.py
index c15b3f6..4a76e9a 100644
--- a/bench/make_bench_piconumpy.py
+++ b/bench/make_bench_piconumpy.py
@@ -47,6 +47,7 @@ def create_tmp_file(name_module):
 import numpy as np
 from piconumpy import array
 from math import pi, cos, sin
+from pprint import pprint
 
 IS_PYPY = hasattr(sys, 'pypy_version_info')
 """
@@ -65,6 +66,8 @@ def create_tmp_file(name_module):
 if not IS_PYPY:
     from tmp_hpy import bench as bench_hpy
 
+pprint({key: sys.implementation.__dict__[key] for key in ("cache_tag", "version")})
+
 # get norm from Julia benchmark
 with open("tmp_result_julia.txt") as file:
     norm = float(file.read())
@@ -75,12 +78,12 @@ def create_tmp_file(name_module):
 name = fmt_name.format("Julia")
 print(f"{name}:     1 * norm = {norm:4.3g} s")
 
-n_sleds = 10
+n_sleds = 100
 n_time = 200
 
 g = locals()
 
-def timeit(name_func, name):
+def timeit(name_func, name, total_duration=2):
     return timeit_verbose(
         name_func + "(n_sleds, n_time)",
         globals=g,
@@ -88,6 +91,7 @@ def timeit(name_func, name):
         print_time=False,
         norm=norm,
         max_length_name=max_length_name,
+        total_duration=total_duration,
     )
 
 timeit("bench", name="PicoNumpy (CPython C-API)")
@@ -95,14 +99,20 @@ def timeit(name_func, name):
     timeit("bench_hpy", name="PicoNumpy (HPy CPy ABI)")
 timeit("bench_hpy_universal", name="PicoNumpy (HPy Universal)")
 timeit("bench_pythran", name="Transonic-Pythran")
-timeit("bench_numpy", name="Numpy")
+try:
+    timeit("bench_numpy", name="Numpy", total_duration=8)
+except RuntimeError:
+    print("Skip bench_numpy because it's too slow")
 timeit(
     "bench_piconumpy_purepy", name="PicoNumpy (purepy)",
 )
 timeit(
     "bench_piconumpy_purepy_array", name="PicoNumpy (purepy_array)",
 )
-timeit("bench_cython", name="PicoNumpy (Cython)")
+try:
+    timeit("bench_cython", name="PicoNumpy (Cython)", total_duration=8)
+except RuntimeError:
+    print("Skip bench_cython because it's too slow")
 """
 )
 
diff --git a/bench/profile_piconumpy.py b/bench/profile_piconumpy.py
index b7de388..3bde5ae 100644
--- a/bench/profile_piconumpy.py
+++ b/bench/profile_piconumpy.py
@@ -7,12 +7,14 @@
 import tmp_purepy
 import tmp_purepy_array
 import tmp_cython
+import tmp_hpy_universal
 
 methods = {
     "cpython-c-api": bench_array1d,
     "purepy": tmp_purepy,
     "purepy_array": tmp_purepy_array,
     "cython": tmp_cython,
+    "universal": tmp_hpy_universal,
 }
 
 module = methods.get(sys.argv[-1], bench_array1d)
diff --git a/piconumpy/test_cpython_capi.py b/piconumpy/test_cpython_capi.py
index a1638dc..cedbed5 100644
--- a/piconumpy/test_cpython_capi.py
+++ b/piconumpy/test_cpython_capi.py
@@ -6,6 +6,7 @@
 
 class Tests:
     piconumpy = _piconumpy_cpython_capi
+
     def _array(self, *args):
         return self.piconumpy.array(*args)
 
diff --git a/piconumpy/test_hpy_universal.py b/piconumpy/test_hpy_universal.py
index fbf5ce4..2a470ca 100644
--- a/piconumpy/test_hpy_universal.py
+++ b/piconumpy/test_hpy_universal.py
@@ -1,3 +1,5 @@
+import sys
+
 import pytest
 
 from .util_hpy import import_ext
@@ -15,3 +17,15 @@
 )
 class TestsCPyABI(_Tests):
     piconumpy = piconumpy_universal
+
+    def test_multiply(self):
+        if sys.implementation.name == "pypy":
+            pytest.xfail("Expected failure with PyPy (but should work)")
+        # pytest.xfail() raises at once, so the inherited test never runs on PyPy.
+        super().test_multiply()
+
+    def test_add(self):
+        if sys.implementation.name == "pypy":
+            pytest.xfail("Expected failure with PyPy (but should work)")
+        # pytest.xfail() raises at once, so the inherited test never runs on PyPy.
+        super().test_add()