#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
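// Reducer function that lets multiple CUDA threads safely accumulate into the same
// output address. It reads the current output value, folds the thread's accumulator
// into it, and tries to publish the result with atomicCAS; if another thread updated
// the output in the meantime, it retries with the value it read back. The payload is
// reinterpreted as a 32-bit or 64-bit word because atomicCAS only operates on integer
// words of those sizes.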
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if __CUDA_ARCH__ >= 300
  if (sizeof(T) == 4) {
    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
    unsigned int newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      // The reduction doesn't change the stored value: nothing to write back.
      return;
    }
    unsigned int readback;
    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
      // Another thread updated the output in the meantime: fold our accumulator into
      // the value we read back and try again.
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else if (sizeof(T) == 8) {
    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
    unsigned long long newval = oldval;
    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
    if (newval == oldval) {
      return;
    }
    unsigned long long readback;
    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
      oldval = readback;
      newval = oldval;
      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
      if (newval == oldval) {
        return;
      }
    }
  }
  else {
    assert(0 && "Wordsize not supported");
  }
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
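// Example (sketch, assuming Eigen's MaxReducer<float> from TensorFunctors.h): a thread
// holding a partial maximum `partial` could publish it into a global result with
//
//   MaxReducer<float> max_reducer;
//   atomicReduce(global_max, partial, max_reducer);
//
// The generic CAS loop above is needed in such cases because CUDA provides no native
// atomicMax for floating point types.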
template <typename T>
__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) {
#if __CUDA_ARCH__ >= 300
  // Summation has direct hardware support, so use atomicAdd instead of the CAS loop.
  atomicAdd(output, accum);
#else
  assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <int BlockSize, int NumPerThread, typename Self,
          typename Reducer, typename Index>
__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                    typename Self::CoeffReturnType* output) {
  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;

  // The first thread of the grid initializes the output value.
  if (first_index == 0) {
    *output = reducer.initialize();
  }

  // Each thread accumulates up to NumPerThread coefficients, strided by BlockSize.
  typename Self::CoeffReturnType accum = reducer.initialize();
  for (Index i = 0; i < NumPerThread; ++i) {
    const Index index = first_index + i * BlockSize;
    if (index >= num_coeffs) {
      break;
    }
    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
    reducer.reduce(val, &accum);
  }

  // Tree-reduce the per-thread accumulators within each warp using shuffle instructions.
  for (int offset = warpSize/2; offset > 0; offset /= 2) {
    reducer.reduce(__shfl_down(accum, offset), &accum);
  }

  // The first lane of each warp merges the warp's result into the global output.
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicReduce(output, accum, reducer);
  }
}
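// Indexing example (illustrative): with BlockSize = 256 and NumPerThread = 128, thread t
// of block b reads coefficients b*32768 + t, b*32768 + t + 256, ..., b*32768 + t + 127*256,
// so at each step the consecutive threads of a warp read consecutive coefficients and the
// loads stay coalesced.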
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
  // Restrict the optimized GPU path to stateless reducers operating on plain floats.
  static const bool HasOptimizedImplementation = !Op::IsStateful &&
      internal::is_same<typename Self::CoeffReturnType, float>::value;

  template <typename OutputType>
  EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
    assert(false && "Should only be called on floats");
  }

  EIGEN_DEVICE_FUNC static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) {
    typedef typename Self::Index Index;

    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    const int block_size = 256;
    const int num_per_thread = 128;
    const int num_blocks = std::ceil(static_cast<float>(num_coeffs) / (block_size * num_per_thread));
    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread>),
                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output);
  }
};
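// Usage sketch (illustrative assumption, not part of this header): this code path is
// normally reached through the tensor expression API rather than called directly,
// along the lines of
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   // gpu_in: TensorMap<Tensor<float, 2>> over device memory,
//   // gpu_out: TensorMap<Tensor<float, 0>> over a single device float.
//   gpu_out.device(gpu_device) = gpu_in.sum();  // dispatches to FullReducer::run above
//
// With block_size = 256 and num_per_thread = 128, each block covers 32768 coefficients,
// so e.g. num_coeffs = 1 << 20 launches ceil(1048576 / 32768) = 32 blocks.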
#endif  // EIGEN_USE_GPU && __CUDACC__

#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H