Iterating over masssive amounts of data can be extremely slow in Python. Rewriting performance-critical code in C can improve performance by over 1000x and is easy to do.

Suppose we have an image transformation algorithm that takes an image as input and produces an image as output.

import time
import PIL.Image
import numpy as np

def transform(data, result):
    shape = np.shape(data)
    data = data / 255
    for y in range(1, shape[0]-1):
        for x in range(1, shape[1]-1):
            v = data[y-1][x-1] * data[y-1][x] * data[y-1][x+1] \
                    * data[y][x-1] * data[y][x] * data[y][x+1] \
                    * data[y+1][x-1] * data[y+1][x] * data[y+1][x+1]
            d = np.linalg.norm(v)
            v = 0 if d == 0.0 else v / d
            result[y][x] = np.minimum(v, np.ones(shape[2]))
    result *= 255
    return result.astype(np.uint8)

def main():
    image = PIL.Image.open('test.png')
    data = np.array(image)
    result = np.empty(np.shape(data))
    output = PIL.Image.fromarray(transform(data, result))
    output.save('out.png')

if __name__ == '__main__':
    main()

This code transforms each pixel value into the unit interval by dividing it by 255, multiplies it with the values of all neighbor pixels, normalizes the result and transforms it back into the RGB format by multiplying with 255.

The input image shall be a watermelon pizza.

And the following is the image produced by our algorithm.

A quick benchmark reveals that transforming one image with this algorithm takes over 9 seconds.

> t0 = time.time()
> n = 10
> for _ in range(n):
>     main()
> print((time.time() - t0) / n)
9.202898073196412

Way too slow. Let’s do some profiling.

> python -m cProfile -s time -o pixel.prof pixel.py
> python -c 'import pstats; p = pstats.Stats("pixel.prof"); p.sort_stats("time").print_stats(4)'
Thu Sep 10 10:41:33 2020    pixel.prof

         9214391 function calls (8705887 primitive calls) in 10.604 seconds

   Ordered by: internal time
   List reduced from 1489 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    5.586    5.586   10.329   10.329 pixel.py:5(transform)
   505284    1.359    0.000    2.737    0.000 /usr/lib/python3/dist-packages/numpy/linalg/linalg.py:2325(norm)
1515856/1010572    1.177    0.000    3.460    0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
   505286    0.371    0.000    0.371    0.000 {built-in method numpy.empty}

The profiling output reveals that most time is spent indeed in the transform function. If you prefer a more graphical output, this can be achieved with gprofdot.

> gprof2dot -f pstats pixel.prof | dot -Tpng -o prof.png

This produces the following image.

Clearly, the transform function is the problem. It is quickly rewritten in C.

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION

#include <Python.h>
#include <numpy/arrayobject.h>
#include <math.h>
#include <stdbool.h>

inline void set_value(PyArrayObject *data, int y, int x, int z, uint8_t v) {
    uint8_t *p = (uint8_t*) PyArray_GETPTR3(data, y, x, z);
    *p = v;
}

inline uint8_t get_value(PyArrayObject *data, int y, int x, int z) {
    return *((uint8_t*) PyArray_GETPTR3(data, y, x, z));
}

inline void normalize(double *v) {
    double imag = 1.0 / sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
    v[0] *= imag;
    v[1] *= imag;
    v[2] *= imag;
}

static PyObject* transform(PyObject* self, PyObject *args) {
    PyArrayObject *arrays[2];
    if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &arrays[0], &PyArray_Type, &arrays[1])) {
        return NULL;
    }

    int neighbors[][2] = {{-1, -1}, {-1, 0}, {-1, 1},
                          {0, -1}, {0, 0}, {0, 1},
                          {1, -1}, {1, 0}, {1, 1}};

    long int *shape = PyArray_SHAPE(arrays[0]);
    int width=shape[1], height=shape[0];

#pragma omp parallel for
    for (int y=1; y<height-1; y++) {
        for (int x=1; x<width-1; x++) {
            double v[] = {1.0, 1.0, 1.0};

            for (int z=0; z<3; z++) {
                for(int i=0; i<9; i++) {
                    v[z] *= get_value(arrays[0], y + neighbors[i][0], x + neighbors[i][1], z) / 255.0;
                }
            }

            normalize(v);

            for (int z=0; z<3; z++) {
                if (v[z] > 1.0)
                    v[z] = 1.0;
                set_value(arrays[1], y, x, z, v[z] * 255.0);
            };
            set_value(arrays[1], y, x, 3, 255);
        }
    }

    Py_RETURN_NONE;
}

/*  define functions in module */
static PyMethodDef PixelMethods[] =
{
     {"transform", transform, METH_VARARGS,
         "transform image and write to result array"},
     {NULL, NULL, 0, NULL}
};

/* module initialization */
static struct PyModuleDef cModPyDem = {
    PyModuleDef_HEAD_INIT,
    "_pixel", "transform image",
    -1,
    PixelMethods
};
PyMODINIT_FUNC PyInit__pixel(void) {
    PyObject *module;
    module = PyModule_Create(&cModPyDem);
    if(module==NULL) return NULL;
    /* IMPORTANT: this must be called */
    import_array();
    if (PyErr_Occurred()) return NULL;
    return module;
}

To build this C Extension create a setup.py.

from setuptools import setup, Extension
import numpy

setup(
    name='pixel',
    version='0.1.0',
    author='Kris',
    author_email='31852063+krisfris@users.noreply.github.com',
    description='',
    long_description='',
    ext_modules=[Extension('_pixel', sources=['pixel.c'],
                           include_dirs=[numpy.get_include()],
                           extra_compile_args=['-fopenmp', '-Ofast'],
                           extra_link_args=['-lgomp'])],
    zip_safe=False,
)

Then build and install your package.

> python setup.py build
> python setup.py install --user

Or…

> python -m pip install .

If you use poetry, poetry install will automatically build your package and add it to the virtualenv.

Finally, change the main function to use the C Extension.

import _pixel

def main():
    image = PIL.Image.open('test.png')
    data = np.array(image)
    result = np.empty(np.shape(data), dtype=np.uint8)
    _pixel.transform(data, result)
    output = PIL.Image.fromarray(result)
    output.save('out.png')

Do another benchmark.

> t0 = time.time()
> n = 10
> for _ in range(n):
>     main()
> print((time.time() - t0) / n)
0.14392392635345458

0.14 seconds. That’s about 64x faster than the Python loop.