Fabcoin Core  0.16.2
P2P Digital Currency
libclwrapper.h
Go to the documentation of this file.
1 #pragma once
2 
3 #define __CL_ENABLE_EXCEPTIONS
4 #define CL_USE_DEPRECATED_OPENCL_2_0_APIS
5 
6 // Just for Hello World Kernel
7 //#define DATA_SIZE 100
8 
9 #if defined(__clang__)
10 #pragma clang diagnostic push
11 #pragma clang diagnostic ignored "-Wunused-parameter"
12 #include "cl.hpp"
13 #pragma clang diagnostic pop
14 #elif defined(__GNUC__)
15 #pragma GCC diagnostic push
16 #pragma GCC diagnostic ignored "-Wstack-protector"
17 #include "cl.hpp"
18 #pragma GCC diagnostic pop
19 #else
20 #include "cl.hpp"
21 #endif
22 #include <time.h>
23 #include <functional>
#include <algorithm>
#include <vector>
24 
25 #include "sodium.h"
26 
// Fixed-width aliases for 8-, 32- and 64-bit unsigned integers.
// NOTE(review): these are declared just ahead of "param.h" — presumably that
// header uses these spellings; confirm before moving or removing them.
27 typedef uint8_t uchar;
28 typedef uint32_t uint;
29 typedef uint64_t ulong;
30 
31 #include "param.h"
32 #include "blake.h"
33 #include <cassert>
34 #include "uint256.h"
35 
// Equihash parameters (n, k) that the derived constants below are computed from.
36 #define EQUIHASH_N 200
37 #define EQUIHASH_K 9
38 
// EQUIHASH_N / (EQUIHASH_K + 1): evaluates to 20 for the values above.
39 #define NUM_COLLISION_BITS (EQUIHASH_N / (EQUIHASH_K + 1))
// 1 << EQUIHASH_K: evaluates to 512; loop bound used by normalize_indices().
40 #define NUM_INDICES (1 << EQUIHASH_K)
41 
// 1 << (NUM_COLLISION_BITS + 1): evaluates to 2^21.
42 #define NUM_VALUES (1 << (NUM_COLLISION_BITS+1))
// 1 << NUM_COLLISION_BITS: evaluates to 2^20.
43 #define NUM_BUCKETS (1 << NUM_COLLISION_BITS)
// Digest length in bytes; digest_t rounds this up to whole uint64_t words.
// NOTE(review): 25 equals EQUIHASH_N / 8 — presumably intentional; confirm.
44 #define DIGEST_SIZE 25
45 
// Alias declared ahead of the struct definition so element_t is usable below.
46 typedef struct element element_t;
// Array of uint64_t words large enough to hold DIGEST_SIZE bytes
// (rounded up: 4 words when DIGEST_SIZE is 25).
47 typedef uint64_t digest_t[(DIGEST_SIZE + sizeof(uint64_t) - 1) / sizeof(uint64_t)];
48 
// Pair of 32-bit indices stored per bucket entry (see bucket_t).
// NOTE(review): the meaning of each field is inferred from its name only;
// nothing in the visible part of this header reads or writes them.
49 struct element {
50  uint32_t digest_index;
51  uint32_t parent_bucket_index;
52 };
53 
54 
// Fixed-capacity bucket: a count plus inline storage for 18 element_t entries.
// NOTE(review): `size` is presumably the number of used slots in `data`;
// nothing in the visible part of this header enforces size <= 18 — confirm in users.
55 typedef struct bucket {
56  unsigned size;
57  element_t data[18];
58 } bucket_t;
59 
// Two 32-bit counters; cl_gpuminer::dbg_size is initialised to sizeof(debug_t).
// NOTE(review): presumably "dropped on collision" / "dropped on store" counts
// filled in by the kernels — inferred from the names, confirm against the kernel source.
60 typedef struct debug_s
61 {
62  uint32_t dropped_coll;
63  uint32_t dropped_stor;
64 } debug_t;
65 
// 32-bit index type; used as the type of cl_gpuminer::z_N.
// NOTE(review): the cross-reference list for this file also reports an
// eh_index typedef in equihash.h — confirm the two stay identical.
66 typedef uint32_t eh_index;
67 
68 class cl_gpuminer
69 {
70 
71 public:
72 
// Public interface of cl_gpuminer. These are declarations only: the
// definitions are not part of this header, so the notes below are inferred
// from names and signatures and should be confirmed against the implementation.
73  cl_gpuminer();
74  ~cl_gpuminer();
75 
// Static device/platform helpers. The searchForAllDevices / doForAllDevices
// overloads take a callback receiving a cl::Device const&; one overload of each
// is restricted to a single platform id, the other takes no platform argument.
// NOTE(review): presumably the bool-returning callback lets a search stop early — confirm.
76  static bool searchForAllDevices(unsigned _platformId, std::function<bool(cl::Device const&)> _callback);
77  static bool searchForAllDevices(std::function<bool(cl::Device const&)> _callback);
78  static void doForAllDevices(unsigned _platformId, std::function<void(cl::Device const&)> _callback);
79  static void doForAllDevices(std::function<void(cl::Device const&)> _callback);
80  static unsigned getNumPlatforms();
// Platform and device ids default to 0 in the two declarations below.
81  static unsigned getNumDevices(unsigned _platformId = 0);
82  static std::string platform_info(unsigned _platformId = 0, unsigned _deviceId = 0);
83  static std::vector<cl::Device> getDevices(std::vector<cl::Platform> const& _platforms, unsigned _platformId);
84  static std::vector<cl::Platform> getPlatforms();
85  static void listDevices();
86 
87  // Currently just prints memory of the GPU
88  static bool configureGPU(
89  unsigned _platformId,
90  unsigned _localWorkSize,
91  unsigned _globalWorkSize
92  );
93 
// Per-instance setup for one platform/device pair. `_kernels` is taken by value.
// NOTE(review): presumably a list of kernel names or sources — confirm.
94  bool init(
95  unsigned _platformId,
96  unsigned _deviceId,
97  std::vector<std::string> _kernels
98  );
99 
// NOTE(review): presumably runs one solver pass over `header` + `nonce` and
// reports results through `indices` / `n_sol`; the role of `ptr` cannot be
// determined from this declaration — confirm against the definition.
100  void run(uint8_t *header, size_t header_len, uint256 nonce, sols_t * indices, uint32_t * n_sol, uint256 * ptr);
101 
102  void finish();
103 
104  /* -- default values -- */
// Values are defined out of line (not visible in this header).
106  static unsigned const c_defaultLocalWorkSize;
108  static unsigned const c_defaultGlobalWorkSizeMultiplier;
110  static unsigned const c_defaultMSPerBatch;
111 
112 private:
// Class-scope constants holding the same values as EQUIHASH_N / EQUIHASH_K.
113  static const unsigned int z_n = 200;
114  static const unsigned int z_k = 9;
// z_n / (z_k + 1): evaluates to 20.
115  static const size_t z_collision_bit_length = z_n / (z_k + 1);
// 1 << (z_collision_bit_length + 1): evaluates to 2^21.
116  static const eh_index z_N = 1 << (z_collision_bit_length + 1);
117 
/**
 * Lexicographically compare two runs of 32-bit indices.
 *
 * Elements are compared pairwise from the front; the first differing pair
 * decides the result.
 *
 * @param a                  first run (at least n_current_indices elements)
 * @param b                  second run (at least n_current_indices elements)
 * @param n_current_indices  number of elements to compare
 * @return -1 if a orders before b, 1 if a orders after b, 0 if every compared
 *         element is equal (including when n_current_indices is 0).
 */
int compare_indices32(uint32_t* a, uint32_t* b, size_t n_current_indices) {
    for (size_t i = 0; i < n_current_indices; ++i) {
        if (a[i] < b[i]) {
            return -1;
        }
        if (a[i] > b[i]) {
            return 1;
        }
        // Equal so far: keep scanning. The previous version returned 0 here,
        // so only the first element was ever inspected.
    }
    return 0;
}
130  void normalize_indices(uint32_t* indices) {
131  for(size_t step_index = 0; step_index < EQUIHASH_K; ++step_index) {
132  for(size_t i = 0; i < NUM_INDICES; i += (1 << (step_index+1))) {
133  if(compare_indices32(indices+i, indices+i+(1 << step_index), (1 << step_index)) > 0) {
134  uint32_t tmp_indices[(1 << step_index)];
135  memcpy(tmp_indices, indices+i, (1 << step_index)*sizeof(uint32_t));
136  memcpy(indices+i, indices+i+(1 << step_index), (1 << step_index)*sizeof(uint32_t));
137  memcpy(indices+i+(1 << step_index), tmp_indices, (1 << step_index)*sizeof(uint32_t));
138  }
139  }
140  }
141  }
/**
 * Format a byte range as lowercase hexadecimal text.
 *
 * The result lives in a function-local static buffer of 1024 bytes: it is
 * overwritten by the next call and must not be used from several threads at
 * once. At most 511 input bytes (1022 hex digits) are rendered; longer input
 * is silently truncated.
 *
 * @param _a     start of the bytes to dump (not dereferenced when a_len is 0)
 * @param a_len  number of bytes to dump
 * @return pointer to the NUL-terminated hex string in the static buffer
 */
char *s_hexdump(const void *_a, uint32_t a_len)
{
    static char buf[1024];
    static const char hex_digits[] = "0123456789abcdef";
    const uint8_t *a = static_cast<const uint8_t *>(_a);
    size_t out = 0;
    // Bound the OUTPUT offset, not the input index: each byte emits two
    // characters, and one slot must remain for the terminator. The previous
    // check (i + 2 < sizeof buf) let writes run past the end of buf for
    // inputs longer than 511 bytes.
    for (uint32_t i = 0; i < a_len && out + 2 < sizeof(buf); ++i) {
        buf[out++] = hex_digits[a[i] >> 4];
        buf[out++] = hex_digits[a[i] & 0x0f];
    }
    buf[out] = '\0';
    return buf;
}
153  {
154  size_t work_size =
155  64 * /* thread per wavefront */
156  BLAKE_WPS * /* wavefront per simd */
157  4 * /* simd per compute unit */
158  36;
159  // Make the work group size a multiple of the nr of wavefronts, while
160  // dividing the number of inputs. This results in the worksize being a
161  // power of 2.
162  while (NR_INPUTS % work_size)
163  work_size += 64;
164  //debug("Blake: work size %zd\n", work_size);
165  return work_size;
166  }
/**
 * Order two adjacent runs of `len` values so the lexicographically smaller
 * run comes first.
 *
 * The runs are a[0..len) and a[len..2*len). They are compared from the front;
 * if the first differing element is larger in the first run, the runs are
 * exchanged from that position onward (the elements before it are equal, so
 * swapping them would change nothing). Equal runs and len == 0 are no-ops.
 *
 * @param a    start of 2*len contiguous values, modified in place
 * @param len  length of each of the two runs
 */
void sort_pair(uint32_t *a, uint32_t len)
{
    uint32_t *const b = a + len;
    // First position where the two runs differ (b == end of the first run).
    const auto diff = std::mismatch(a, b, b);
    if (diff.first != b && *diff.first > *diff.second)
        std::swap_ranges(diff.first, b, diff.second);
}
182 
183  uint32_t verify_sol(sols_t *sols, unsigned sol_i) {
184  uint32_t *inputs = sols->values[sol_i];
185  uint32_t seen_len = (1 << (PREFIX + 1)) / 8;
186  uint8_t seen[seen_len];
187  uint32_t i;
188  uint8_t tmp;
189  // look for duplicate inputs
190  memset(seen, 0, seen_len);
191  for (i = 0; i < (1 << PARAM_K); i++)
192  {
193  tmp = seen[inputs[i] / 8];
194  seen[inputs[i] / 8] |= 1 << (inputs[i] & 7);
195  if (tmp == seen[inputs[i] / 8])
196  {
197  // at least one input value is a duplicate
198  sols->valid[sol_i] = 0;
199  return 0;
200  }
201  }
202  // the valid flag is already set by the GPU, but set it again because
203  // I plan to change the GPU code to not set it
204  sols->valid[sol_i] = 1;
205  // sort the pairs in place
206  for (uint32_t level = 0; level < PARAM_K; level++)
207  for (i = 0; i < (1 << PARAM_K); i += (2 << level))
208  sort_pair(&inputs[i], 1 << level);
209  return 1;
210  }
// OpenCL objects owned by this instance.
// NOTE(review): listing line 212 is absent here and the file's cross-reference
// list mentions cl::CommandQueue — presumably a command-queue member belongs
// between these two lines; confirm against the original header.
211  cl::Context m_context;
213  std::vector<cl::Kernel> m_gpuKernels;
// Device buffers. NOTE(review): roles inferred from names only (hash tables,
// solutions, debug counters, row counters) — confirm in the implementation.
214  cl::Buffer buf_ht[2];
215  cl::Buffer buf_sols;
216  cl::Buffer buf_dbg;
217  cl::Buffer rowCounters[2];
218 
// No in-class initialiser: must be set by code not visible in this header.
219  uint64_t nonce;
220  uint64_t total;
// Size in bytes of one debug_t record.
221  size_t dbg_size = 1 * sizeof (debug_t);
222 
// Constant zero of type cl_int. NOTE(review): presumably used as a fill or
// argument value by the implementation — confirm.
223  const cl_int zero = 0;
// No in-class initialiser for the next two members either.
224  uint32_t solutions;
225  uint32_t * dst_solutions;
226 
227  unsigned m_globalWorkSize;
228  bool m_openclOnePointOne;
229  unsigned m_deviceBits;
230 
232  unsigned int m_stepWorkSizeAdjust;
234  int m_wayWorkSizeAdjust = 0;
235 
// Class-wide settings; defined out of line (not visible in this header).
237  static unsigned s_workgroupSize;
239  static unsigned s_initialGlobalWorkSize;
241  static unsigned s_msPerBatch;
243  static bool s_allowCPU;
246  static unsigned s_extraRequiredGPUMem;
247 
248  const char *get_error_string(cl_int error)
249  {
250  switch(error){
251  // run-time and JIT compiler errors
252  case 0: return "CL_SUCCESS";
253  case -1: return "CL_DEVICE_NOT_FOUND";
254  case -2: return "CL_DEVICE_NOT_AVAILABLE";
255  case -3: return "CL_COMPILER_NOT_AVAILABLE";
256  case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
257  case -5: return "CL_OUT_OF_RESOURCES";
258  case -6: return "CL_OUT_OF_HOST_MEMORY";
259  case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
260  case -8: return "CL_MEM_COPY_OVERLAP";
261  case -9: return "CL_IMAGE_FORMAT_MISMATCH";
262  case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
263  case -11: return "CL_BUILD_PROGRAM_FAILURE";
264  case -12: return "CL_MAP_FAILURE";
265  case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
266  case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
267  case -15: return "CL_COMPILE_PROGRAM_FAILURE";
268  case -16: return "CL_LINKER_NOT_AVAILABLE";
269  case -17: return "CL_LINK_PROGRAM_FAILURE";
270  case -18: return "CL_DEVICE_PARTITION_FAILED";
271  case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
272 
273  // compile-time errors
274  case -30: return "CL_INVALID_VALUE";
275  case -31: return "CL_INVALID_DEVICE_TYPE";
276  case -32: return "CL_INVALID_PLATFORM";
277  case -33: return "CL_INVALID_DEVICE";
278  case -34: return "CL_INVALID_CONTEXT";
279  case -35: return "CL_INVALID_QUEUE_PROPERTIES";
280  case -36: return "CL_INVALID_COMMAND_QUEUE";
281  case -37: return "CL_INVALID_HOST_PTR";
282  case -38: return "CL_INVALID_MEM_OBJECT";
283  case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
284  case -40: return "CL_INVALID_IMAGE_SIZE";
285  case -41: return "CL_INVALID_SAMPLER";
286  case -42: return "CL_INVALID_BINARY";
287  case -43: return "CL_INVALID_BUILD_OPTIONS";
288  case -44: return "CL_INVALID_PROGRAM";
289  case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
290  case -46: return "CL_INVALID_KERNEL_NAME";
291  case -47: return "CL_INVALID_KERNEL_DEFINITION";
292  case -48: return "CL_INVALID_KERNEL";
293  case -49: return "CL_INVALID_ARG_INDEX";
294  case -50: return "CL_INVALID_ARG_VALUE";
295  case -51: return "CL_INVALID_ARG_SIZE";
296  case -52: return "CL_INVALID_KERNEL_ARGS";
297  case -53: return "CL_INVALID_WORK_DIMENSION";
298  case -54: return "CL_INVALID_WORK_GROUP_SIZE";
299  case -55: return "CL_INVALID_WORK_ITEM_SIZE";
300  case -56: return "CL_INVALID_GLOBAL_OFFSET";
301  case -57: return "CL_INVALID_EVENT_WAIT_LIST";
302  case -58: return "CL_INVALID_EVENT";
303  case -59: return "CL_INVALID_OPERATION";
304  case -60: return "CL_INVALID_GL_OBJECT";
305  case -61: return "CL_INVALID_BUFFER_SIZE";
306  case -62: return "CL_INVALID_MIP_LEVEL";
307  case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
308  case -64: return "CL_INVALID_PROPERTY";
309  case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
310  case -66: return "CL_INVALID_COMPILER_OPTIONS";
311  case -67: return "CL_INVALID_LINKER_OPTIONS";
312  case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
313 
314  // extension errors
315  case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
316  case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
317  case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
318  case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
319  case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
320  case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
321  case -9999: return "NVIDIA: ILLEGAL READ OR WRITE TO A BUFFER";
322  default:
323  fprintf(stderr, "'%d'\n", error);
324  return "Unknown OpenCL error";
325  }
326  }
327 };
bool error(const char *fmt, const Args &...args)
Definition: util.h:178
#define function(a, b, c, d, k, s)
#define EQUIHASH_K
Definition: libclwrapper.h:37
#define NUM_INDICES
Definition: libclwrapper.h:40
#define BLAKE_WPS
Definition: libclwrapper.h:11
uint32_t uint
Definition: libclwrapper.h:40
uint32_t parent_bucket_index
Definition: libclwrapper.h:51
char * s_hexdump(const void *_a, uint32_t a_len)
Definition: libclwrapper.h:142
concurrent_queue< JitTask > m_queue
Definition: SmartVM.cpp:60
uint8_t valid[MAX_SOLS]
Definition: libclwrapper.h:17
uint64_t ulong
Definition: libclwrapper.h:41
int compare_indices32(uint32_t *a, uint32_t *b, size_t n_current_indices)
Definition: libclwrapper.h:118
void sort_pair(uint32_t *a, uint32_t len)
Definition: libclwrapper.h:167
#define a(i)
uint32_t digest_index
Definition: libclwrapper.h:50
uint8_t uchar
Definition: libclwrapper.h:39
void normalize_indices(uint32_t *indices)
Definition: libclwrapper.h:130
const char * get_error_string(cl_int error)
Definition: libclwrapper.h:248
#define DIGEST_SIZE
Definition: libclwrapper.h:44
uint32_t eh_index
Definition: libclwrapper.h:66
#define b(i, j)
256-bit opaque blob.
Definition: uint256.h:132
CommandQueue interface for cl_command_queue.
Definition: cl.hpp:2566
#define NR_INPUTS
Definition: param.h:6
uint8_t const size_t const size
Definition: sha3.h:20
void * memcpy(void *a, const void *b, size_t c)
struct bucket bucket_t
struct debug_s debug_t
uint values[MAX_SOLS][512]
Definition: libclwrapper.h:18
Memory buffer interface.
Definition: cl.hpp:1748
size_t select_work_size_blake(void)
Definition: libclwrapper.h:152
uint32_t verify_sol(sols_t *sols, unsigned sol_i)
Definition: libclwrapper.h:183
#define PREFIX
Definition: param.h:5
Device interface for cl_device_id.
Definition: cl.hpp:1190
uint32_t eh_index
Definition: equihash.h:25
#define PARAM_K
Definition: param.h:4
uint8_t const * data
Definition: sha3.h:19
C++ bindings for OpenCL 1.0 (rev 48) and OpenCL 1.1 (rev 33)
uint64_t digest_t[(DIGEST_SIZE+sizeof(uint64_t)-1)/sizeof(uint64_t)]
Definition: libclwrapper.h:47