clustering  0.12
Clustering suite for molecular dynamics trajectories.
 All Classes Namespaces Files Functions Variables Typedefs
density_clustering_opencl.cpp
1 /*
2 Copyright (c) 2015, Florian Sittel (www.lettis.net)
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #define __CL_ENABLE_EXCEPTIONS
27 
28 #include "density_clustering_opencl.hpp"
29 
30 #include <iostream>
31 
32 namespace Clustering {
33 namespace Density {
34 namespace OpenCL {
35 
36  std::map<float, std::vector<std::size_t>>
37  calculate_populations(const float* coords,
38  const std::size_t n_rows,
39  const std::size_t n_cols,
40  std::vector<float> radii) {
41  unsigned int uint_n_rows = (unsigned int) n_rows;
42  unsigned int uint_n_cols = (unsigned int) n_cols;
43  try {
44  std::vector<cl::Platform> platforms;
45  cl::Platform::get(&platforms);
46  cl_context_properties cps[3] = {
47  CL_CONTEXT_PLATFORM,
48  (cl_context_properties) (platforms[0])(),
49  0
50  };
51  cl::Context context(CL_DEVICE_TYPE_GPU, cps);
52  std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
53  std::vector<cl::CommandQueue> queues;
54  for (cl::Device dev: devices) {
55  queues.push_back(cl::CommandQueue(context, dev));
56  }
57  std::size_t n_queues = queues.size();
58  // compute needed/avail mem sizes
59  std::size_t n_bytes_per_row;
60  std::size_t n_bytes_global_mem;
61  n_bytes_per_row = n_cols * sizeof(float);
62  n_bytes_global_mem = devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
63  // do devices have enough memory for full data set?
64  if (n_bytes_per_row * n_rows + (sizeof(unsigned int)*n_rows) > n_bytes_global_mem) {
65  // TODO compute in chunks
66  std::cerr << "error: not enough memory on device for full data set.\n"
67  << " will support memory-size independet computation in a later release.\n"
68  << " please check for updates." << std::endl;
69  exit(EXIT_FAILURE);
70  }
71  // create buffers and copy data to device(s)
72  cl::Buffer buf_coords = cl::Buffer(context, CL_MEM_READ_ONLY, n_rows*n_cols*sizeof(float));
73  std::vector<cl::Buffer> buf_pops(n_queues);
74  for (int iq=0; iq < n_queues; ++iq) {
75  queues[iq].enqueueWriteBuffer(buf_coords, CL_TRUE, 0, n_rows*n_cols*sizeof(float), coords);
76  buf_pops[iq] = cl::Buffer(context, CL_MEM_WRITE_ONLY, n_rows*sizeof(unsigned int));
77  }
78 //TODO: fix
79  // load kernel source
80 // std::string kernel_src =
81 // #include "kernel/pops.h"
82 // ;
83  cl::Program::Sources src(1, {kernel_src.c_str(), kernel_src.length()+1});
84  cl::Program prog = cl::Program(context, src);
85  int err = prog.build(devices);
86  if (err != CL_SUCCESS) {
87  std::cerr << "error during kernel compilation" << std::endl;
88  std::cerr << prog.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]) << std::endl;
89  exit(EXIT_FAILURE);
90  }
91  // initialize pops for every queue to zero
92  cl::Kernel krnl_init_pops(prog, "init_pops");
93  for (int iq=0; iq < n_queues; ++iq) {
94  krnl_init_pops.setArg(0, buf_pops[iq]);
95  cl::NDRange full_global_range(n_rows);
96  cl::Event event;
97  queues[iq].enqueueNDRangeKernel(krnl_init_pops, cl::NullRange, full_global_range, cl::NullRange, NULL, &event);
98  // very short and simple kernel: just do a blocking call
99  event.wait();
100  }
101  cl::Kernel krnl_pops(prog, "pops");
102  // set kernel args
103  float rad2 = radii[0]*radii[0];
104  // TODO play with workgroup sizes
105  const unsigned int GLOBAL_SIZE = 1024;
106  const unsigned int WORKGROUP_SIZE = 128;
107  krnl_pops.setArg(2, sizeof(unsigned int), &uint_n_rows);
108  krnl_pops.setArg(3, sizeof(unsigned int), &uint_n_cols);
109  krnl_pops.setArg(4, buf_coords);
110  krnl_pops.setArg(5, sizeof(float), &rad2);
111  krnl_pops.setArg(7, sizeof(float)*WORKGROUP_SIZE*n_cols, NULL);
113  // extend range to a size that is evenly divisible by the workgroup size.
114  // the kernel will ignore all workitems with index >= n_cols.
115  unsigned int range_length = n_rows;
116  while (range_length % WORKGROUP_SIZE != 0) {
117  ++range_length;
118  }
119  cl::NDRange global(GLOBAL_SIZE);
120  cl::NDRange local(WORKGROUP_SIZE);
121  // run pops kernel repeatedly, until the full range has been sampled
122  for (unsigned int i_block_ref=0; i_block_ref < range_length; i_block_ref += WORKGROUP_SIZE) {
123  krnl_pops.setArg(1, sizeof(unsigned int), &i_block_ref);
124  for (unsigned int i_block=0; i_block < range_length; i_block += n_queues*GLOBAL_SIZE) {
125  std::vector<cl::Event> events(n_queues);
126  for (unsigned int iq=0; iq < n_queues; ++iq) {
127  unsigned int q_block = i_block + iq*GLOBAL_SIZE;
128  krnl_pops.setArg(0, sizeof(unsigned int), &q_block);
129  krnl_pops.setArg(6, buf_pops[iq]);
130  queues[iq].enqueueNDRangeKernel(kernel, cl::NullRange, global, local, NULL, &events[iq]);
131  }
132  cl::Event::waitForEvents(events);
133  }
134  }
135  // retrieve results from all devices
136  std::vector<std::size_t> pops(n_rows, 0);
137  for (int iq=0; iq < n_queues; ++iq) {
138  std::vector<unsigned int> partial_pops(n_rows);
139  queues[iq].enqueueReadBuffer(buf_pops[iq], CL_TRUE, 0, n_rows*sizeof(unsigned int), partial_pops.data());
140  for (std::size_t i=0; i < n_rows; ++i) {
141  pops[i] += partial_pops[i];
142  }
143  }
144  return {{radii[0], pops}};
145  } catch(cl::Error error) {
146  std::cerr << "error in OpenCL call: " << error.what() << "(" << error.err() << ")" << std::endl;
147  exit(EXIT_FAILURE);
148  }
149  }
150 
151 } // end namespace OpenCL
152 } // end namespace Density
153 } // end namespace Clustering
154 
std::map< float, std::vector< std::size_t > > calculate_populations(const float *coords, const std::size_t n_rows, const std::size_t n_cols, std::vector< float > radii)