Technology & AI
NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab

# Banner for tutorial step 5: an empty line, then the heading framed by two
# 90-character rules. The leading "\n" (not a literal "n") keeps the banner
# visually separated from whatever was printed before it.
print("\n" + "=" * 90)
print("[5] cuTile kernels are defined only if cuda.tile imports successfully")
print("=" * 90)
if cutile_import_ok:
    # Alias for the ct.Constant[int] annotation used on every tile-size
    # parameter of the kernels below.
    ConstInt = ct.Constant[int]

    # Vector add via tiled load/store: each block reads the TILE-wide tile of
    # `a` and `b` found at its own block index and stores their sum into `c`.
    @ct.kernel
    def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
        block = ct.bid(0)
        lhs = ct.load(a, index=(block,), shape=(TILE,))
        rhs = ct.load(b, index=(block,), shape=(TILE,))
        ct.store(c, index=(block,), tile=lhs + rhs)

    # Vector add via gather/scatter: same result as the kernel above, but the
    # element offsets handled by a block are built explicitly.
    @ct.kernel
    def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
        block = ct.bid(0)
        # Offsets for this block start at block * TILE.
        idx = block * TILE + ct.arange(TILE, dtype=torch.int32)
        total = ct.gather(a, idx) + ct.gather(b, idx)
        ct.scatter(c, idx, total)

    # 2-D element-wise add via gather/scatter over a TILE_M x TILE_N patch
    # selected by the block's two grid coordinates.
    @ct.kernel
    def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
        block_row = ct.bid(0)
        block_col = ct.bid(1)
        row_offsets = block_row * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
        col_offsets = block_col * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
        # Turn the offsets into a column and a row so that, taken together,
        # they address every (row, col) pair of the patch.
        row_idx = row_offsets[:, None]
        col_idx = col_offsets[None, :]
        total = ct.gather(a, (row_idx, col_idx)) + ct.gather(b, (row_idx, col_idx))
        ct.scatter(c, (row_idx, col_idx), total)

    # Tiled matrix multiply: each block produces the (TM, TN) tile of C at its
    # grid coordinates by stepping over the tiles of A along axis 1.
    @ct.kernel
    def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
        tile_row = ct.bid(0)
        tile_col = ct.bid(1)
        k_tiles = ct.num_tiles(A, axis=1, shape=(TM, TK))
        # The running sum is kept in float32 whatever the input dtype is.
        accum = ct.full((TM, TN), 0, dtype=ct.float32)
        pad = ct.PaddingMode.ZERO
        # float32 operands are converted to tfloat32 before the mma; any other
        # dtype is fed to the mma unchanged.
        mma_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
        for k_idx in range(k_tiles):
            lhs = ct.load(
                A,
                index=(tile_row, k_idx),
                shape=(TM, TK),
                padding_mode=pad,
            ).astype(mma_dtype)
            rhs = ct.load(
                B,
                index=(k_idx, tile_col),
                shape=(TK, TN),
                padding_mode=pad,
            ).astype(mma_dtype)
            accum = ct.mma(lhs, rhs, accum)
        # Convert the accumulator to C's dtype before writing it out.
        ct.store(C, index=(tile_row, tile_col), tile=ct.astype(accum, C.dtype))
else:
    print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
# Banner for tutorial step 6: an empty line, then the heading framed by two
# 90-character rules. The leading "\n" (not a literal "n") keeps the banner
# visually separated from whatever was printed before it.
print("\n" + "=" * 90)
print("[6] High-level wrappers")
print("=" * 90)
def vec_add_tutorial(a, b, use_gather=True):
    """Add two equally-shaped tensors element-wise.

    Launches one of the cuTile vector-add kernels when ``likely_runtime_ok``
    is truthy and ``a`` is a CUDA tensor; otherwise evaluates ``a + b``.

    Args:
        a: First operand. NOTE(review): both kernels address elements with a
            single block index, so the cuTile path presumably expects 1-D
            tensors -- confirm against callers.
        b: Second operand; must have the same ``shape`` as ``a``.
        use_gather: When true, launch the gather/scatter kernel with a fixed
            tile of 256 elements. When false, launch the load/store kernel
            with a tile equal to the smallest power of two that is >= the
            element count, capped at 1024.

    Returns:
        A new tensor holding the element-wise sum.

    Raises:
        ValueError: If ``a`` and ``b`` do not have the same shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        n = a.numel()
        if n == 0:
            # Nothing to add. Returning here also keeps math.log2 away from 0
            # (which raises) and avoids requesting a launch with no blocks.
            return c
        TILE = 256 if use_gather else min(1024, 2 ** math.ceil(math.log2(n)))
        grid = (math.ceil(n / TILE), 1, 1)
        kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
        ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
        return c
    return a + b
def matrix_add_tutorial(a, b):
    """Add two equally-shaped matrices element-wise.

    Launches ``cutile_matrix_add_gather_kernel`` on a 2-D grid of 16 x 64
    tiles when ``likely_runtime_ok`` is truthy and ``a`` is a CUDA tensor;
    otherwise evaluates ``a + b``.

    Args:
        a: First operand. The cuTile path reads ``a.shape[0]`` and
            ``a.shape[1]`` to size the grid, so it needs at least two
            dimensions there.
        b: Second operand; must have the same ``shape`` as ``a``.

    Returns:
        A new tensor holding the element-wise sum.

    Raises:
        ValueError: If ``a`` and ``b`` do not have the same shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")
    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        if a.numel() == 0:
            # Nothing to add; skip the launch instead of requesting a grid
            # with a zero-sized dimension.
            return c
        TILE_M = 16
        TILE_N = 64
        grid = (math.ceil(a.shape[0] / TILE_M), math.ceil(a.shape[1] / TILE_N), 1)
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matrix_add_gather_kernel,
            (a, b, c, TILE_M, TILE_N),
        )
        return c
    return a + b
def matmul_tutorial(A, B):
    """Compute the matrix product of ``A`` and ``B``.

    Launches ``cutile_matmul_kernel`` when ``likely_runtime_ok`` is truthy and
    ``A`` is a CUDA tensor; otherwise evaluates ``A @ B``.

    Args:
        A: Left operand; ``A.shape[0]`` and ``A.shape[1]`` are read as the
            row count and the inner dimension.
        B: Right operand; ``B.shape[0]`` must equal ``A.shape[1]``.
            NOTE(review): B's dtype and device are not checked here; the
            output is allocated with A's dtype on A's device.

    Returns:
        On the cuTile path, a new ``(A.shape[0], B.shape[1])`` tensor with A's
        dtype and device; otherwise whatever ``A @ B`` returns.

    Raises:
        ValueError: If the inner dimensions of ``A`` and ``B`` differ.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")
    if likely_runtime_ok and A.is_cuda:
        rows, inner, cols = A.shape[0], A.shape[1], B.shape[1]
        if rows == 0 or cols == 0 or inner == 0:
            # Degenerate product: the result is empty or all zeros, so there
            # is nothing to tile and no reason to request an empty launch.
            return torch.zeros((rows, cols), device=A.device, dtype=A.dtype)
        # Half-precision inputs get larger tiles than every other dtype.
        if A.dtype in (torch.float16, torch.bfloat16):
            TM, TN, TK = 128, 128, 64
        else:
            TM, TN, TK = 32, 32, 32
        C = torch.empty((rows, cols), device=A.device, dtype=A.dtype)
        grid = (math.ceil(rows / TM), math.ceil(cols / TN), 1)
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matmul_kernel,
            (A, B, C, TM, TN, TK),
        )
        return C
    return A @ B
# Report that the wrapper functions are defined and which backend they will
# dispatch to, as decided by the likely_runtime_ok flag.
print("Wrappers ready.")
print("Execution backend: " + ("cuTile" if likely_runtime_ok else "PyTorch fallback"))


