Composable Kernel: include/ck_tile/host/rotating_buffers.hpp Source File

rotating_buffers.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/host/hip_check_error.hpp"

#include <hip/hip_runtime.h>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

namespace ck_tile {

// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
//
// Purpose:
// When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
// will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
// timing measurements. This wrapper rotates through multiple copies of the buffers at
// different memory addresses to force cache misses.
//
// How it works:
// Constructor: Creates rotating_count copies of matrices A and B in GPU memory
// Next():      Switches pointers to the next buffer copy (cycles through all copies)
// Destructor:  Frees the extra buffer copies and restores the original pointers
//
// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
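//
// Example usage (a sketch, not part of this header): the rotating count, loop bound, and
// kernel launch below are illustrative, and how the rotated A/B pointers reach the kernel
// arguments is application-specific.
//
//   ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
//       p_a_dev, p_b_dev, /*rotating_count_hint=*/8, bytes_a, bytes_b);
//   rotating_mem.Print();
//   for(int i = 0; i < n_repeat; ++i)
//   {
//       ck_tile::flush_icache(); // invalidate the instruction cache on every CU
//       rotating_mem.Next();     // switch A/B to the next copy so L2 misses
//       // ... launch the kernel with the wrapper's current A/B pointers ...
//   }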
template <typename ADataType, typename BDataType>
struct RotatingMemWrapper
{
    RotatingMemWrapper() = delete;
    RotatingMemWrapper(const void* a_ptr_,
                       const void* b_ptr_,
                       std::size_t rotating_count_hint,
                       std::size_t size_a_,
                       std::size_t size_b_)
        : a_ptr(a_ptr_),
          b_ptr(b_ptr_),
          rotating_count(rotating_count_hint),
          size_a(size_a_),
          size_b(size_b_)
    {
        // Store the original buffer pointers as the first entry
        p_a_grids.push_back(a_ptr);
        p_b_grids.push_back(b_ptr);

        // Limit the rotating count so the total footprint of all copies stays below 2 GiB,
        // preventing out-of-memory failures.
        const uint64_t footprint          = (size_a + size_b);
        const uint64_t max_rotating_count = (1ULL << 31) / footprint;
        rotating_count = std::min(rotating_count, max_rotating_count);
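        // e.g. with size_a + size_b == 64 MiB, at most 2 GiB / 64 MiB == 32 copies are kept
        // (illustrative numbers).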

        // Create (rotating_count - 1) additional copies at different memory addresses
        for(size_t i = 1; i < rotating_count; i++)
        {
            {
                void* pADeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
                                          const_cast<void*>(p_a_grids[0]), // source buffer
                                          size_a_,
                                          hipMemcpyDeviceToDevice));
                p_a_grids.push_back(pADeviceBuf);
            }

            {
                void* pBDeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
                                          const_cast<void*>(p_b_grids[0]), // source buffer
                                          size_b_,
                                          hipMemcpyDeviceToDevice));
                p_b_grids.push_back(pBDeviceBuf);
            }
        }
    }

    // Rotate to the next buffer copy. Call this before each kernel run to use different
    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
    void Next()
    {
        if(rotating_count > 1)
        {
            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
            a_ptr = p_a_grids[idx];
            b_ptr = p_b_grids[idx];
        }
    }

    void Print()
    {
        std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                  << ", rotating_count: " << rotating_count << "}" << std::endl;
    }

    // Cleanup: Free all extra buffer copies (keeping the original) and restore the original pointers
    ~RotatingMemWrapper() noexcept
    {
        if(rotating_count > 1)
        {
            // Restore the original buffer pointers
            a_ptr = p_a_grids[0];
            b_ptr = p_b_grids[0];

            // Free the extra buffer copies (index 0 is the original, don't free it)
            for(size_t i = 1; i < rotating_count; i++)
            {
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_b_grids[i])));
            }
        }
    }

    private:
    const void* a_ptr;
    const void* b_ptr;
    std::size_t iter           = 0;
    std::size_t rotating_count = 1;
    std::size_t size_a         = 0;
    std::size_t size_b         = 0;
    std::vector<const void*> p_a_grids;
    std::vector<const void*> p_b_grids;
};

inline void flush_icache()
{
    hipDeviceProp_t deviceProps;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));

    // Over-provision blocks to ensure all CUs execute the flush instruction.
    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
    constexpr int32_t blocks_per_cu = 60;
    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
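    // e.g. a GPU with 304 CUs would launch 304 * 60 = 18240 blocks of 64 threads each
    // (illustrative count; multiProcessorCount is queried at runtime above).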

    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
    HIP_CHECK_ERROR(hipGetLastError());
}
} // namespace ck_tile