Technology & AI

A Datashader Coding Tutorial: Rendering Large Datasets with High-Performance Python Visual Analytics

# Install the tutorial's third-party dependencies with the same interpreter
# that is running this script (quietly, via "python -m pip").
import subprocess
import sys

subprocess.check_call(
    [
        sys.executable, "-m", "pip", "install", "-q",
        *("datashader", "colorcet", "numba", "scipy"),
    ]
)


import numpy  as np
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
from datashader import reductions as rd
import colorcet as cc
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.gridspec import GridSpec
from scipy.stats import multivariate_normal
import time, warnings
warnings.filterwarnings("ignore")


# Report which Datashader release the rest of the tutorial runs against.
print(f"Datashader version: {ds.__version__}")


def show(img, title="", ax=None, figsize=(6, 5)):
    """Draw a shaded image on a Matplotlib axes with a bold title and no frame.

    Args:
        img: Object exposing ``to_pil()``; its result is handed to ``imshow``.
        title: Text placed above the image.
        ax: Axes to draw on. When omitted, a new figure of size ``figsize``
            is created and displayed immediately.
        figsize: Size of the figure created when ``ax`` is omitted.
    """
    owns_figure = ax is None
    if owns_figure:
        _, ax = plt.subplots(figsize=figsize)

    ax.imshow(img.to_pil(), origin="upper", aspect="auto")
    ax.set_title(title, fontsize=11, fontweight="bold")
    ax.axis("off")

    # A caller-supplied axes belongs to a larger figure; the caller lays it
    # out and shows it.
    if not owns_figure:
        return
    plt.tight_layout()
    plt.show()


# ── Section 1: core pipeline (canvas → aggregate → shade) ──────────────────
# The banner needs a real newline escape; without the backslash it printed a
# literal "n" in front of the heading.
print("\n=== SECTION 1: Core Pipeline ===")


rng = np.random.default_rng(42)  # fixed seed so the figures are reproducible
N = 2_000_000


# Three Gaussian clusters of N // 3 points each: two tight blobs centred at
# (-1, -1) and (1, 1), plus one centred at the origin that is wide in x.
x = np.concatenate([rng.normal(-1, 0.5, N // 3),
                    rng.normal(1, 0.5, N // 3),
                    rng.normal(0, 1.5, N // 3)])
y = np.concatenate([rng.normal(-1, 0.5, N // 3),
                    rng.normal(1, 0.5, N // 3),
                    rng.normal(0, 0.5, N // 3)])
df_base = pd.DataFrame({"x": x, "y": y})


# The canvas fixes the output resolution and the data window to rasterise.
canvas = ds.Canvas(plot_width=600, plot_height=500,
                   x_range=(-4, 4), y_range=(-4, 4))


# Aggregate: count how many points land in each pixel.
agg = canvas.points(df_base, "x", "y", agg=rd.count())


# Shade the same aggregate three ways to compare the normalisation modes.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
combos = [
    ("Linear / blues", tf.shade(agg, cmap=cc.blues, how="linear")),
    ("Log    / fire", tf.shade(agg, cmap=cc.fire, how="log")),
    ("Eq-hist / bmy", tf.shade(agg, cmap=cc.bmy, how="eq_hist")),
]
for ax, (title, img) in zip(axes, combos):
    show(img, title, ax=ax)
plt.suptitle("Section 1 – 2 M points: Linear vs Log vs Eq-Hist normalisation",
             fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()


# ── Section 2: one panel per reduction type ────────────────────────────────
# The banner needs a real newline escape; without the backslash it printed a
# literal "n" in front of the heading.
print("\n=== SECTION 2: Reduction Types ===")


# Add a numeric column and a three-level categorical column to the Section 1
# frame so that value-based and categorical reductions have something to use.
n_actual = len(df_base)
df_base["value"] = rng.exponential(scale=2, size=n_actual)
df_base["label"] = pd.Categorical(
    rng.choice(["A", "B", "C"], size=n_actual),
    categories=["A", "B", "C"],
)


canvas2 = ds.Canvas(plot_width=400, plot_height=350,
                    x_range=(-4, 4), y_range=(-4, 4))


# (panel title, reduction, colormap). A colormap of None marks the
# categorical reduction, which is shaded with a per-category colour key.
reductions_cfg = [
    ("count()", rd.count(), cc.kbc),
    ("sum(value)", rd.sum("value"), cc.CET_L3),
    ("mean(value)", rd.mean("value"), cc.CET_D4),
    ("std(value)", rd.std("value"), cc.CET_L16),
    ("min(value)", rd.min("value"), cc.CET_L17),
    ("max(value)", rd.max("value"), cc.bgyw),
    ("var(value)", rd.var("value"), cc.CET_L18),
    ("count_cat(label)", rd.count_cat("label"), None),
]


# 2 x 4 grid: one axes per entry in reductions_cfg.
fig, axes = plt.subplots(2, 4, figsize=(18, 9))
axes = axes.flat


for ax, (name, agg_fn, cmap) in zip(axes, reductions_cfg):
    agg_r = canvas2.points(df_base, "x", "y", agg=agg_fn)
    if cmap is None:
        img = tf.shade(agg_r, color_key={"A": "#e41a1c", "B": "#377eb8", "C": "#4daf4a"})
    else:
        img = tf.shade(agg_r, cmap=cmap, how="eq_hist")
    show(img, name, ax=ax)


plt.suptitle("Section 2 – All Reduction Types on 2 M points", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()


# ── Section 3: categorical rendering ───────────────────────────────────────
# The banner needs a real newline escape; without the backslash it printed a
# literal "n" in front of the heading.
print("\n=== SECTION 3: Categorical Visualisation ===")


N_cat = 500_000
categories = ["Cluster A", "Cluster B", "Cluster C", "Cluster D"]
centers = [(-2, -2), (-2, 2), (2, -2), (2, 2)]
colors = {"Cluster A": "#e41a1c", "Cluster B": "#377eb8",
          "Cluster C": "#4daf4a", "Cluster D": "#ff7f00"}


# One Gaussian blob per category, each with an equal share of N_cat points.
n = N_cat // len(categories)  # same for every cluster, so computed once
frames = []
for cat, (cx, cy) in zip(categories, centers):
    frames.append(pd.DataFrame({
        "x": rng.normal(cx, 0.8, n),
        "y": rng.normal(cy, 0.8, n),
        "cat": pd.Categorical([cat] * n, categories=categories),
    }))
df_cat = pd.concat(frames, ignore_index=True)


canvas3 = ds.Canvas(plot_width=500, plot_height=500,
                    x_range=(-5, 5), y_range=(-5, 5))
# Per-pixel counts kept separately for each category.
agg_cat = canvas3.points(df_cat, "x", "y", agg=rd.count_cat("cat"))


fig, axes = plt.subplots(1, 3, figsize=(16, 5))


# Panel 1: the shaded aggregate as-is.
img_raw = tf.shade(agg_cat, color_key=colors)
show(img_raw, "Raw (no spread)", ax=axes[0])


# Panel 2: the same shading passed through tf.spread with px=1.
img_sp1 = tf.spread(tf.shade(agg_cat, color_key=colors), px=1)
show(img_sp1, "Spread px=1", ax=axes[1])


# Panel 3: the same shading with the background set to black.
img_bg = tf.set_background(tf.shade(agg_cat, color_key=colors), color="black")
show(img_bg, "Black background", ax=axes[2])


# Empty plot calls exist only to register one legend entry per category
# colour on the third panel.
for cat, col in colors.items():
    axes[2].plot([], [], "o", color=col, label=cat, markersize=8)
axes[2].legend(loc="lower right", fontsize=8, framealpha=0.6)


plt.suptitle("Section 3 – Categorical Rendering (500 k points)", fontsize=13, fontweight="bold")
plt.tight_layout()
plt.show()

Related Articles

Leave a Reply

Your email address will not be published. Required fields are marked *

Back to top button