28 #include "density_clustering.hpp"
31 #include "density_clustering_opencl.hpp"
36 namespace Clustering {
40 const std::size_t n_rows,
41 const std::size_t n_cols,
47 const int BOX_DIM_1 = 0;
48 const int BOX_DIM_2 = 1;
50 ASSUME_ALIGNED(coords);
52 float min_x1=0.0f, max_x1=0.0f, min_x2=0.0f, max_x2=0.0f;
53 min_x1=coords[0*n_cols+BOX_DIM_1];
54 max_x1=coords[0*n_cols+BOX_DIM_1];
56 min_x2=coords[0*n_cols+BOX_DIM_2];
57 max_x2=coords[0*n_cols+BOX_DIM_2];
60 for (std::size_t i=1; i < n_rows; ++i) {
61 min_x1 = std::min(min_x1, coords[i*n_cols+BOX_DIM_1]);
62 max_x1 = std::max(max_x1, coords[i*n_cols+BOX_DIM_1]);
64 min_x2 = std::min(min_x2, coords[i*n_cols+BOX_DIM_2]);
65 max_x2 = std::max(max_x2, coords[i*n_cols+BOX_DIM_2]);
69 grid.
n_boxes.push_back((max_x1 - min_x1) / radius + 1);
71 grid.
n_boxes.push_back((max_x2 - min_x2) / radius + 1);
76 for (std::size_t i=0; i < n_rows; ++i) {
77 int i_box_1=0, i_box_2=0;
78 i_box_1 = (coords[i*n_cols+BOX_DIM_1] - min_x1) / radius;
80 i_box_2 = (coords[i*n_cols+BOX_DIM_2] - min_x2) / radius;
90 return {center[0] +
BOX_DIFF[i_neighbor][0]
91 , center[1] + BOX_DIFF[i_neighbor][1]};
104 std::vector<std::size_t>
106 const std::size_t n_rows,
107 const std::size_t n_cols,
108 const float radius) {
109 std::vector<float> radii = {radius};
111 std::map<float, std::vector<std::size_t>> pop_map =
117 std::map<float, std::vector<std::size_t>> pop_map =
120 return pop_map[radius];
123 std::map<float, std::vector<std::size_t>>
125 const std::size_t n_rows,
126 const std::size_t n_cols,
127 std::vector<float> radii) {
128 std::map<float, std::vector<std::size_t>> pops;
129 for (
float rad: radii) {
130 pops[rad].resize(n_rows, 1);
132 std::sort(radii.begin(), radii.end(), std::greater<float>());
133 std::size_t n_radii = radii.size();
134 std::vector<float> rad2(n_radii);
135 for (std::size_t i=0; i < n_radii; ++i) {
136 rad2[i] = radii[i]*radii[i];
138 ASSUME_ALIGNED(coords);
139 std::size_t i, j, k, l, ib;
151 std::vector<int> box_buffer;
152 #pragma omp parallel for default(none) private(i,box,box_buffer,center,i_neighbor,ib,dist,j,k,l,c) \
153 firstprivate(n_rows,n_cols,n_radii,radii,rad2,N_NEIGHBOR_BOXES) \
154 shared(coords,pops,grid) \
155 schedule(dynamic,1024)
156 for (i=0; i < n_rows; ++i) {
162 box_buffer = grid.
boxes[box];
164 for (ib=0; ib < box_buffer.size(); ++ib) {
168 #pragma simd reduction(+:dist)
169 for (k=0; k < n_cols; ++k) {
170 c = coords[i*n_cols+k] - coords[j*n_cols+k];
173 for (l=0; l < n_radii; ++l) {
174 if (dist < rad2[l]) {
176 pops[radii[l]][i] += 1;
178 pops[radii[l]][j] += 1;
196 const std::size_t n_frames = pops.size();
197 const float max_pop = (float) ( * std::max_element(pops.begin(), pops.end()));
198 std::vector<float> fe(n_frames);
199 #pragma omp parallel for default(none) private(i) firstprivate(max_pop, n_frames) shared(fe, pops)
200 for (i=0; i < n_frames; ++i) {
201 fe[i] = (float) -1 * log(pops[i]/max_pop);
206 std::vector<FreeEnergy>
208 std::vector<FreeEnergy> fe_sorted;
209 for (std::size_t i=0; i < fe.size(); ++i) {
213 std::sort(fe_sorted.begin(),
219 std::tuple<Neighborhood, Neighborhood>
221 const std::size_t n_rows,
222 const std::size_t n_cols,
223 const std::vector<float>& free_energy) {
227 for (std::size_t i=0; i < n_rows; ++i) {
228 nh[i] =
Neighbor(n_rows+1, std::numeric_limits<float>::max());
229 nh_high_dens[i] =
Neighbor(n_rows+1, std::numeric_limits<float>::max());
232 std::size_t i, j, c, min_j, min_j_high_dens;
233 float dist, d, mindist, mindist_high_dens;
234 ASSUME_ALIGNED(coords);
235 #pragma omp parallel for default(none) \
236 private(i,j,c,dist,d,mindist,mindist_high_dens,min_j,min_j_high_dens) \
237 firstprivate(n_rows,n_cols) \
238 shared(coords,nh,nh_high_dens,free_energy) \
239 schedule(dynamic, 2048)
240 for (i=0; i < n_rows; ++i) {
241 mindist = std::numeric_limits<float>::max();
242 mindist_high_dens = std::numeric_limits<float>::max();
244 min_j_high_dens = n_rows+1;
245 for (j=0; j < n_rows; ++j) {
248 #pragma simd reduction(+:dist)
249 for (c=0; c < n_cols; ++c) {
250 d = coords[i*n_cols+c] - coords[j*n_cols+c];
254 if (dist < mindist) {
259 if (free_energy[j] < free_energy[i] && dist < mindist_high_dens) {
260 mindist_high_dens = dist;
266 nh_high_dens[i] =
Neighbor(min_j_high_dens, mindist_high_dens);
268 return std::make_tuple(nh, nh_high_dens);
273 std::set<std::size_t>
275 const std::size_t n_cols,
276 const std::vector<FreeEnergy>& sorted_fe,
277 const std::size_t i_frame,
278 const std::size_t limit,
279 const float max_dist) {
282 std::vector<int> frame_in_nh(limit, 0);
283 std::set<std::size_t> nh;
285 const std::size_t i_frame_sorted = sorted_fe[i_frame].first * n_cols;
287 ASSUME_ALIGNED(coords);
288 #pragma omp parallel for default(none) private(j,c,d,dist2) \
289 firstprivate(i_frame,i_frame_sorted,limit,max_dist,n_cols) \
290 shared(coords,sorted_fe,frame_in_nh)
291 for (j=0; j < limit; ++j) {
294 #pragma simd reduction(+:dist2)
295 for (c=0; c < n_cols; ++c) {
296 d = coords[i_frame_sorted+c] - coords[sorted_fe[j].first*n_cols+c];
299 if (dist2 < max_dist) {
305 for (j=0; j < limit; ++j) {
306 if (frame_in_nh[j] > 0) {
317 for (
auto match: nh) {
320 sigma2 += match.second.second;
322 return (sigma2 / nh.size());
325 std::vector<std::size_t>
328 const std::vector<float>& free_energy) {
330 std::vector<std::size_t> clustering(initial_clustering);
331 for (
const auto& fe: fe_sorted) {
332 std::size_t
id = fe.first;
333 if (clustering[
id] == 0) {
334 std::size_t neighbor_id = nh_high_dens.find(
id)->second.first;
336 clustering[id] = clustering[neighbor_id];
344 main(boost::program_options::variables_map args) {
345 using namespace Clustering::Tools;
346 const std::string input_file = args[
"file"].as<std::string>();
353 std::tie(coords, n_rows, n_cols) = read_coords<float>(input_file);
355 std::vector<float> free_energies;
356 if (args.count(
"free-energy-input")) {
359 }
else if (args.count(
"free-energy") || args.count(
"population") || args.count(
"output")) {
360 if (args.count(
"radii")) {
362 if (args.count(
"output")) {
363 std::cerr <<
"error: clustering cannot be done with several radii (-R is set)." << std::endl;
366 if ( ! (args.count(
"population") || args.count(
"free-energy"))) {
367 std::cerr <<
"error: no output defined for populations or free energies. why did you define -R ?" << std::endl;
370 std::vector<float> radii = args[
"radii"].as<std::vector<float>>();
374 std::map<float, std::vector<std::size_t>> pops =
calculate_populations(coords, n_rows, n_cols, radii);
376 for (
auto radius_pops: pops) {
377 if (args.count(
"population")) {
378 std::string basename_pop = args[
"population"].as<std::string>() +
"_%f";
381 if (args.count(
"free-energy")) {
382 std::string basename_fe = args[
"free-energy"].as<std::string>() +
"_%f";
387 if ( ! args.count(
"radius")) {
388 std::cerr <<
"error: radius (-r) is required!" << std::endl;
390 const float radius = args[
"radius"].as<
float>();
394 if (args.count(
"population")) {
395 write_pops(args[
"population"].as<std::string>(), pops);
399 if (args.count(
"free-energy")) {
400 write_fes(args[
"free-energy"].as<std::string>(), free_energies);
407 if (args.count(
"nearest-neighbors-input")) {
409 auto nh_pair =
read_neighborhood(args[
"nearest-neighbors-input"].as<std::string>());
411 nh_high_dens = nh_pair.second;
412 }
else if (args.count(
"nearest-neighbors") || args.count(
"output")) {
414 if ( ! args.count(
"radius")) {
415 std::cerr <<
"error: radius (-r) is required!" << std::endl;
418 nh = std::get<0>(nh_tuple);
419 nh_high_dens = std::get<1>(nh_tuple);
420 if (args.count(
"nearest-neighbors")) {
425 if (args.count(
"output")) {
426 const std::string output_file = args[
"output"].as<std::string>();
427 std::vector<std::size_t> clustering;
428 if (args.count(
"input")) {
431 }
else if (args.count(
"threshold-screening")) {
432 std::vector<float> threshold_params = args[
"threshold-screening"].as<std::vector<float>>();
433 if (threshold_params.size() > 3) {
434 std::cerr <<
"error: option -T expects at most three floating point arguments: FROM STEP TO." << std::endl;
437 Clustering::logger(std::cout) <<
"running free energy landscape screening" << std::endl;
440 float t_to = *std::max_element(free_energies.begin(), free_energies.end());
441 if (threshold_params.size() >= 1 && threshold_params[0] >= 0.0f) {
442 t_from = threshold_params[0];
444 if (threshold_params.size() >= 2) {
445 t_step = threshold_params[1];
447 if (threshold_params.size() == 3) {
448 t_to = threshold_params[2];
450 std::vector<std::size_t> clustering(n_rows);
454 float t_to_low = t_to - t_step/10.0f + t_step;
455 float t_to_high = t_to + t_step/10.0f + t_step;
456 for (
float t=t_from; (t < t_to_low) && !(t_to_high < t); t += t_step) {
466 if (args.count(
"threshold") == 0) {
467 std::cerr <<
"error: need threshold value for initial clustering" << std::endl;
470 float threshold = args[
"threshold"].as<
float>();
473 if ( ! args[
"only-initial"].as<bool>() && ( ! args.count(
"threshold-screening"))) {
474 Clustering::logger(std::cout) <<
"assigning low density states to initial clusters" << std::endl;
477 Clustering::logger(std::cout) <<
"writing clusters to file " << output_file << std::endl;
478 write_single_column<std::size_t>(output_file, clustering);
std::vector< std::size_t > assign_low_density_frames(const std::vector< std::size_t > &initial_clustering, const Neighborhood &nh_high_dens, const std::vector< float > &free_energy)
double compute_sigma2(const Neighborhood &nh)
const int N_NEIGHBOR_BOXES
number of neigbor boxes in 2D grid (including center box).
bool is_valid_box(const Box box, const BoxGrid &grid)
std::set< std::size_t > high_density_neighborhood(const float *coords, const std::size_t n_cols, const std::vector< FreeEnergy > &sorted_fe, const std::size_t i_frame, const std::size_t limit, const float max_dist)
std::map< float, std::vector< std::size_t > > calculate_populations(const float *coords, const std::size_t n_rows, const std::size_t n_cols, std::vector< float > radii)
void main(boost::program_options::variables_map args)
Clustering::Tools::Neighborhood Neighborhood
map frame id to neighbors
std::vector< int > n_boxes
total number of boxes
std::map< Box, std::vector< int > > boxes
the boxes with a list of assigned frame ids
std::tuple< Neighborhood, Neighborhood > nearest_neighbors(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< float > &free_energy)
constexpr int BOX_DIFF[9][2]
std::ostream & logger(std::ostream &s)
std::vector< std::size_t > initial_density_clustering(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< std::size_t > initial_clusters)
BoxGrid compute_box_grid(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
std::vector< std::size_t > calculate_populations(const float *coords, const std::size_t n_rows, const std::size_t n_cols, const float radius)
calculate population of n-dimensional hypersphere per frame for one fixed radius. ...
constexpr Box neighbor_box(const Box center, const int i_neighbor)
std::vector< float > calculate_free_energies(const std::vector< std::size_t > &pops)
std::vector< FreeEnergy > sorted_free_energies(const std::vector< float > &fe)
std::pair< std::size_t, float > FreeEnergy
matches frame id to free energy
std::array< int, 2 > Box
encodes 2D box for box-assisted search algorithm
Clustering::Tools::Neighbor Neighbor
matches neighbor's frame id to distance
std::vector< Box > assigned_box
matching frame id to the frame's assigned box