PyCUDA
Introduction
Author: Andreas Klöckner
- PyCUDA lets you access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g., it won't detach from a context before all memory allocated in it is also freed)
- Convenience
- Abstractions to compile CUDA code from Python: pycuda.driver.SourceModule
- A GPU memory buffer: pycuda.gpuarray.GPUArray (see the short sketch after this list)
- Completeness
- Binding to all of CUDA’s driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA’s base layer is written in C++
- Helpful documentation
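As a quick illustration of the GPUArray abstraction mentioned above, here is a minimal sketch (assuming a working CUDA installation) that moves a numpy array to the GPU, doubles it there, and copies the result back:

import pycuda.autoinit  # initialize the CUDA driver and create a context
import pycuda.gpuarray as gpuarray
import numpy

# Transfer a 4 x 4 float32 array to the GPU.
a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32))
# The multiplication runs on the GPU; .get() copies the result back.
a_doubled = (2 * a_gpu).get()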
Example
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule

# Compile a kernel that multiplies two vectors elementwise.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    const int i = threadIdx.x;
    dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

# drv.In and drv.Out copy the numpy arrays to and from the GPU
# around the kernel call.
multiply_them(
    drv.Out(dest), drv.In(a), drv.In(b),
    block=(400, 1, 1), grid=(1, 1))

assert numpy.allclose(dest, a * b)
print(dest)
Exercise 6
- Run the above example.
- Modify and execute it to work on a 20 x 10 matrix (one possible sketch follows this list).
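One possible sketch for the matrix exercise, assuming the same imports as in the example above. A 2-D thread block covers the whole 20 x 10 matrix, and the kernel flattens the 2-D thread index into a linear offset into the (C contiguous) data:

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
    // Flatten the 2-D thread index into a linear offset.
    const int i = threadIdx.x + threadIdx.y * blockDim.x;
    dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(20, 10).astype(numpy.float32)
b = numpy.random.randn(20, 10).astype(numpy.float32)
dest = numpy.zeros_like(a)

# One 10 x 20 block: threadIdx.x indexes columns, threadIdx.y rows.
multiply_them(
    drv.Out(dest), drv.In(a), drv.In(b),
    block=(10, 20, 1), grid=(1, 1))
assert numpy.allclose(dest, a * b)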
Theano + PyCUDA

The following example defines a Theano Op that uses a PyCUDA kernel to double a float32 CudaNdarray elementwise:
import numpy
import theano
import theano.misc.pycuda_init
import theano.sandbox.cuda as cuda
from pycuda.compiler import SourceModule


class PyCUDADoubleOp(theano.Op):
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, inp):
        # Move the input to the GPU and make it C contiguous.
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [inp.type()])

    def make_thunk(self, node, storage_map, _, _2):
        # Compile the kernel once, when the thunk is built.
        mod = SourceModule("""
        __global__ void my_fct(float *i0, float *o0, int size) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < size) {
                o0[i] = i0[i] * 2;
            }
        }""")
        pycuda_fct = mod.get_function("my_fct")
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            # Reallocate the output only if it is missing or has the
            # wrong shape.
            if z[0] is None or z[0].shape != inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            # One thread per element, 512 threads per block.
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512, 1, 1), grid=grid)
        return thunk
Test it!
>>> x = theano.tensor.fmatrix() # doctest: +SKIP
>>> f = theano.function([x], PyCUDADoubleOp()(x)) # doctest: +SKIP
>>> xv=numpy.ones((4,5), dtype="float32") # doctest: +SKIP
>>> assert numpy.allclose(f(xv), xv*2) # doctest: +SKIP
>>> print(numpy.asarray(f(xv))) # doctest: +SKIP
Exercises 7
- Run the above example.
- Modify and execute the example to multiply two matrices: x * y (a starting sketch follows this list).
- Modify and execute the example to return two outputs: x + y and x - y.
- Note that our current elemwise fusion generates computations with only one output.
- Modify and execute the example to support strides (i.e. do not force the input to be C contiguous).
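A possible starting point for the first exercise is sketched below. PyCUDAMulOp is a hypothetical name, not part of Theano or PyCUDA; the sketch assumes both inputs are float32 matrices of the same shape, and only the parts that differ from PyCUDADoubleOp are shown:

class PyCUDAMulOp(theano.Op):
    # __eq__, __hash__ and __str__ as in PyCUDADoubleOp ...

    def make_node(self, x, y):
        # Both inputs are moved to the GPU and made C contiguous.
        x = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(x))
        y = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(y))
        assert x.dtype == y.dtype == "float32"
        return theano.Apply(self, [x, y], [x.type()])

The kernel in make_thunk takes a second input and computes the elementwise product:

__global__ void my_fct(float *i0, float *i1, float *o0, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        o0[i] = i0[i] * i1[i];
    }
}

and the thunk passes both inputs to the kernel: pycuda_fct(inputs[0][0], inputs[1][0], z[0], numpy.intc(inputs[0][0].size), block=(512, 1, 1), grid=grid).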