# branch: master
# external_benchmark_kernel_launch.py
# 1165 bytes
import time
from tinygrad import Tensor, TinyJit, Device, Context
from tinygrad.helpers import Profiling, Timing, GlobalCounters

# python3 test/test_speed_v_torch.py TestSpeed.test_add_a

@TinyJit
def plus(a:Tensor, b:Tensor):
  """Return ``a + b``; this is the TinyJit-decorated function whose launch cost the script times."""
  total = a + b
  return total

if __name__ == "__main__":
  # Inputs: two single-element tensors, realized before any timing starts.
  a = Tensor([1]).realize()
  b = Tensor([1]).realize()

  # Phase 1: five calls, each measured (call + device synchronize) by the Timing helper.
  for run in range(5):
    with Timing(prefix=f"{run}:"):
      out = plus(a,b)
      Device[out.device].synchronize()
  # Sanity check on the last result: 1 + 1 must be 2.
  assert out.item() == 2

  # Phase 2: the clock is stopped before synchronizing, so only the call to plus is timed.
  for run in range(5):
    start = time.perf_counter()
    out = plus(a,b)
    elapsed = time.perf_counter() - start
    Device[out.device].synchronize()
    print(f"nosync  {run}: {elapsed*1e6:.2f} us")

  # Phase 3: the clock is stopped after synchronizing, so the wait on the device is included.
  for run in range(5):
    start = time.perf_counter()
    out = plus(a,b)
    Device[out.device].synchronize()
    elapsed = time.perf_counter() - start
    print(f"precise {run}: {elapsed*1e6:.2f} us")

  # Phase 4: the kernel-time counter is required to still be zero here; then one
  # call is timed under DEBUG=2 and wall time is compared with the counter.
  assert GlobalCounters.time_sum_s == 0
  with Context(DEBUG=2):
    start = time.perf_counter()
    out = plus(a,b)
    Device[out.device].synchronize()
    elapsed = time.perf_counter() - start
  kernel_s = GlobalCounters.time_sum_s
  # 1e-12 keeps the ratio finite if the counter is still zero.
  print(f"kernel {kernel_s*1e3:.2f} ms / full {elapsed*1e3:.2f} ms -- {elapsed/(kernel_s+1e-12):.2f} x")

  # Phase 5: run a single call under the Profiling helper (no explicit synchronize).
  with Profiling():
    out = plus(a,b)