numba is a just-in-time (JIT) compiler for Python. With a few simple annotations, array-oriented and math-heavy Python code can be just-in-time compiled to achieve performance similar to that of C, C++ and Fortran, without having to switch languages or Python interpreters.
Press Spacebar
to go to the next slide (or ?
to see all navigation shortcuts)
Lunch Time Python, Scientific Software Center, Heidelberg University
conda install numba
python -m pip install numba
Toy example: implement a vector reduction operation:
$r(x, y) = \sum_i \cos(x_i)\,\sin(y_i)$
Some random vectors to benchmark our functions:
import numpy as np
# 5 million uniform samples in [-1, 1): large enough that the pure-Python
# loop takes ~1 s, so the speedups of the compiled versions are visible.
x = np.random.uniform(low=-1, high=1, size=5000000)
y = np.random.uniform(low=-1, high=1, size=5000000)
import math
def r_python(x_vec, y_vec):
    """Pure-Python baseline: return sum_i cos(x_vec[i]) * sin(y_vec[i]).

    Iterates element-by-element, so every operation goes through the
    Python interpreter — the slowest of the implementations compared.
    """
    return sum(math.cos(a) * math.sin(b) for a, b in zip(x_vec, y_vec))
r_python(x, y)
1088.7818529133356
%timeit r_python(x,y)
1.28 s ± 9.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
def r_numpy(x_vec, y_vec):
    """Vectorized NumPy version: dot product of cos(x_vec) and sin(y_vec)."""
    cos_x = np.cos(x_vec)
    sin_y = np.sin(y_vec)
    return np.dot(cos_x, sin_y)
r_numpy(x, y)
1088.7818529133306
%timeit r_numpy(x,y)
133 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# pip install cython
%load_ext cython
%%cython
import math
def r_cython(x_vec, y_vec):
    """Same pure-Python loop, compiled by Cython without any type hints.

    Cython alone gives only a modest speedup here: every element is
    still a Python object, so math.cos/math.sin calls dominate.
    """
    total = 0
    for xi, yi in zip(x_vec, y_vec):
        total += math.cos(xi) * math.sin(yi)
    return total
r_cython(x, y)
1088.7818529133356
%timeit r_cython(x,y)
973 ms ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%cython
import math
# use C math functions
from libc.math cimport sin, cos
# use C types instead of Python types
# Typed Cython version: typed memoryviews plus libc's cos/sin let the
# loop compile to plain C with no per-element Python API calls.
def r_cython(double[:] x_vec, double[:] y_vec):
    # C-typed accumulator and loop index keep the hot loop in C.
    cdef double s = 0
    cdef int i
    for i in range(len(x_vec)):
        s += cos(x_vec[i])*sin(y_vec[i])
    return s
r_cython(x, y)
1088.7818529133356
%timeit r_cython(x,y)
102 ms ± 127 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# On Google Colab, install the fortran-magic extension quietly;
# elsewhere it is assumed to be installed already.
if "google.colab" in str(get_ipython()):
    !pip install fortran-magic -qqq
%load_ext fortranmagic
%%fortran
! Compute res = sum_i cos(x_vec(i)) * sin(y_vec(i)).
! NOTE(review): the arguments are single precision (default `real`), so the
! float64 NumPy inputs are narrowed — presumably this explains the result
! 1088.7864 here versus 1088.7819 from the double-precision versions; confirm
! against f2py's conversion behavior.
subroutine r_fortran(x_vec, y_vec, res)
    real, intent(in) :: x_vec(:), y_vec(:)
    real, intent(out) :: res
    integer :: i, n
    n = size(x_vec)
    res = 0
    do i=1,n
        res = res + cos(x_vec(i))*sin(y_vec(i))
    enddo
endsubroutine r_fortran
r_fortran(x, y)
1088.786376953125
%timeit r_fortran(x,y)
62.1 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# On Google Colab, install the ipybind extension (pybind11 cell magic)
# from GitHub quietly; elsewhere it is assumed to be installed already.
if "google.colab" in str(get_ipython()):
    !pip install git+https://github.com/aldanor/ipybind.git -qqq
%load_ext ipybind
%%pybind11
#include <pybind11/numpy.h>
#include <math.h>
// Build the extension module "example" exposing r_pybind(x, y), which
// returns sum_i cos(x[i]) * sin(y[i]) over two 1-D float64 arrays.
// PYBIND11_MODULE replaces the deprecated PYBIND11_PLUGIN macro (removed
// in pybind11 2.6): the module object is supplied as `m` and no manual
// `return m.ptr();` is needed.
PYBIND11_MODULE(example, m) {
    m.def("r_pybind", [](const py::array_t<double>& x, const py::array_t<double>& y) {
        double sum{0};
        // unchecked<1>() asserts 1-D and skips per-access bounds checks.
        auto rx{x.unchecked<1>()};
        auto ry{y.unchecked<1>()};
        for (py::ssize_t i = 0; i < rx.shape(0); i++){
            sum += std::cos(rx[i])*std::sin(ry[i]);
        }
        return sum;
    });
}
r_pybind(x, y)
1088.7818529133356
%timeit r_pybind(x, y)
98.8 ms ± 77.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
from numba import jit
@jit
def r_numba(x_vec, y_vec):
    """Numba-JIT-compiled reduction: sum of cos(x)*sin(y) over pairs.

    The body is the same pure-Python loop as r_python; numba compiles
    it to machine code on the first call.
    """
    total = 0
    for xi, yi in zip(x_vec, y_vec):
        total += math.cos(xi) * math.sin(yi)
    return total
r_numba(x, y)
1088.7818529133356
# pure python with numba JIT
%timeit r_numba(x,y)
101 ms ± 342 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Two compilation modes:
- nopython mode (the default): compiles the function to machine code that runs without the Python C API
- object mode (the fallback): used when nopython mode is not possible
Pass nopython=True (or use @njit) to disable the object-mode fallback.
You can optionally explicitly specify the function signature. Use cases:
from numba import float32
# NOTE(review): the explicit float32 signature narrows the float64 inputs,
# which is why sum(1, 0.99999999) below returns exactly 2.0 — this example
# demonstrates the precision loss of forcing a narrower signature.
# The name also shadows the builtin `sum` (intentional in the slide).
@jit(float32(float32, float32))
def sum(a, b):
    return a + b
sum(1, 0.99999999)
2.0
Useful @jit options:
- nopython=True: disable the object-mode fallback
- nogil=True: release the Python Global Interpreter Lock (GIL)
- cache=True: cache the compiled functions on disk
- parallel=True: enable automatic parallelization; it also enables prange to explicitly parallelize a loop over a range
from numba import jit, prange
@jit(parallel=True)
def r_numba(x_vec, y_vec):
    """Parallel reduction: sum_i cos(x_vec[i]) * sin(y_vec[i]).

    prange splits the loop iterations across threads; numba recognizes
    the `s += ...` pattern as a reduction and combines the partial sums.
    """
    s = 0
    for i in prange(len(x_vec)):
        # BUG FIX: index the parameters x_vec/y_vec — the original indexed
        # the module-level globals x and y, silently ignoring the arguments.
        s += math.cos(x_vec[i]) * math.sin(y_vec[i])
    return s
r_numba(x, y)
1088.7818529133433
%timeit r_numba(x,y)
34.4 ms ± 683 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
A ufunc is a function that operates on scalars. Decorate a scalar function with @numba.vectorize and use it like a built-in numpy ufunc:
from numba import vectorize, float64
# Compile r into a numpy-style ufunc: the scalar body is applied
# elementwise, and target="parallel" spreads the work across threads.
# Like any ufunc it accepts scalars or arrays (with broadcasting).
@vectorize([float64(float64, float64)], target="parallel")
def r(x, y):
    return np.cos(x) * np.sin(y)
r(2, 3)
-0.05872664492762098
r(x, y)
array([ 0.52823121, -0.11269745, -0.4178665 , ..., 0.25507448, 0.72928639, -0.09443755])
np.sum(r(x, y))
1088.7818529133344
%timeit np.sum(r(x,y))
55.9 ms ± 587 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Other numba decorators:
- @generated_jit: decorator for compile-time logic, e.g. type specializations
- @stencil: decorator for creating a stencil to apply to an array
- @cfunc: decorator to generate a C callback (e.g. to pass to scipy.integrate)