clustering  0.12
Clustering suite for molecular dynamics trajectories.
 All Classes Namespaces Files Functions Variables Typedefs
density_clustering_common.cpp
1 /*
2 Copyright (c) 2015, Florian Sittel (www.lettis.net)
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "logger.hpp"
28 
29 #ifdef DC_USE_MPI
30  #include "density_clustering_mpi.hpp"
31 #endif
32 
33 namespace Clustering {
34 namespace Density {
35 
36  std::vector<std::size_t>
37  initial_density_clustering(const std::vector<float>& free_energy
38  , const Neighborhood& nh
39  , const float free_energy_threshold
40  , const float* coords
41  , const std::size_t n_rows
42  , const std::size_t n_cols
43  , const std::vector<std::size_t> initial_clusters
44 #ifdef DC_USE_MPI
45  , const int mpi_n_nodes
46  , const int mpi_node_id
47 #endif
48  ) {
49 #ifdef DC_USE_MPI
50  using namespace Clustering::Density::MPI;
51 #endif
52  std::vector<std::size_t> clustering;
53  bool have_initial_clusters = (initial_clusters.size() == n_rows);
54  if (have_initial_clusters) {
55  clustering = initial_clusters;
56  } else {
57  clustering = std::vector<std::size_t>(n_rows);
58  }
59  // sort lowest to highest (low free energy = high density)
60  std::vector<FreeEnergy> fe_sorted = sorted_free_energies(free_energy);
61  // find last frame below free energy threshold
62  auto lb = std::upper_bound(fe_sorted.begin(),
63  fe_sorted.end(),
64  FreeEnergy(0, free_energy_threshold),
65  [](const FreeEnergy& d1, const FreeEnergy& d2) -> bool {return d1.second < d2.second;});
66  std::size_t first_frame_above_threshold = (lb - fe_sorted.begin());
67  // compute sigma as deviation of nearest-neighbor distances
68  // (beware: actually, sigma2 is E[x^2] > Var(x) = E[x^2] - E[x]^2,
69  // with x being the distances between nearest neighbors).
70  // then compute a neighborhood with distance 4*sigma2 only on high density frames.
71  double sigma2 = compute_sigma2(nh);
72 #ifdef DC_USE_MPI
73  if (mpi_node_id == MAIN_PROCESS) {
74 #endif
75  logger(std::cout) << "sigma2: " << sigma2 << std::endl
76  << first_frame_above_threshold << " frames with low free energy / high density" << std::endl
77  << "first frame above threshold has free energy: "
78  << fe_sorted[first_frame_above_threshold].second << std::endl
79  << "merging initial clusters" << std::endl;
80 #ifdef DC_USE_MPI
81  }
82 #endif
83  // initialize distinct name from initial clustering
84  std::size_t distinct_name = *std::max_element(clustering.begin(), clustering.end());
85  bool neighboring_clusters_merged = false;
86  std::set<std::size_t> visited_frames = {};
87  if (have_initial_clusters) {
88  // initialize visited_frames from initial clustering
89  // (with indices in order of sorted free energies)
90  for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
91  std::size_t i_original = fe_sorted[i].first;
92  if (initial_clusters[i_original] != 0) {
93  visited_frames.insert(i);
94  }
95  }
96  logger(std::cout) << "# already visited: " << visited_frames.size() << std::endl;
97  }
98  // indices inside this loop are in order of sorted(!) free energies
99  while ( ! neighboring_clusters_merged) {
100  neighboring_clusters_merged = true;
101 #ifdef DC_USE_MPI
102  if (mpi_node_id == MAIN_PROCESS) {
103 #endif
104  logger(std::cout) << "initial merge iteration" << std::endl;
105 #ifdef DC_USE_MPI
106  }
107 #endif
108  for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
109  if (visited_frames.count(i) == 0) {
110  visited_frames.insert(i);
111  // all frames/clusters in local neighborhood should be merged ...
112 #ifdef DC_USE_MPI
113  std::set<std::size_t> local_nh = Clustering::Density::MPI::high_density_neighborhood(coords,
114  n_cols,
115  fe_sorted,
116  i,
117  first_frame_above_threshold,
118  4*sigma2,
119  mpi_n_nodes,
120  mpi_node_id);
121 #else
122  std::set<std::size_t> local_nh = high_density_neighborhood(coords,
123  n_cols,
124  fe_sorted,
125  i,
126  first_frame_above_threshold,
127  4*sigma2);
128 #endif
129  // ... let's see if at least some of them already have a
130  // designated cluster assignment
131  std::set<std::size_t> cluster_names;
132  for (auto j: local_nh) {
133  cluster_names.insert(clustering[fe_sorted[j].first]);
134  }
135  if ( ! (cluster_names.size() == 1 && cluster_names.count(0) != 1)) {
136  neighboring_clusters_merged = false;
137  // remove the 'zero' state, i.e. state of unassigned frames
138  if (cluster_names.count(0) == 1) {
139  cluster_names.erase(0);
140  }
141  std::size_t common_name;
142  if (cluster_names.size() > 0) {
143  // indeed, there are already cluster assignments.
144  // these should now be merged under a common name.
145  // (which will be the id with smallest numerical value,
146  // due to the properties of STL-sets).
147  common_name = (*cluster_names.begin());
148  } else {
149  // no clustering of these frames yet.
150  // choose a distinct name.
151  common_name = ++distinct_name;
152  }
153  for (auto j: local_nh) {
154  clustering[fe_sorted[j].first] = common_name;
155  }
156  std::size_t j;
157  #pragma omp parallel for default(none) private(j)\
158  firstprivate(common_name,first_frame_above_threshold,cluster_names) \
159  shared(clustering,fe_sorted)
160  for (j=0; j < first_frame_above_threshold; ++j) {
161  if (cluster_names.count(clustering[fe_sorted[j].first]) == 1) {
162  clustering[fe_sorted[j].first] = common_name;
163  }
164  }
165  }
166  }
167  } // end for
168  } // end while
169  // normalize names
170  std::set<std::size_t> final_names;
171  for (std::size_t i=0; i < first_frame_above_threshold; ++i) {
172  final_names.insert(clustering[fe_sorted[i].first]);
173  }
174  std::map<std::size_t, std::size_t> old_to_new;
175  old_to_new[0] = 0;
176  std::size_t new_name=0;
177  for (auto name: final_names) {
178  old_to_new[name] = ++new_name;
179  }
180  // write clustered trajectory
181  for(auto& elem: clustering) {
182  elem = old_to_new[elem];
183  }
184  return clustering;
185  }
186 
187 } // end namespace Density
188 } // end namespace Clustering
189 
const int MAIN_PROCESS
identify MPI process 0 as main process
double compute_sigma2(const Neighborhood &nh)
Clustering::Tools::Neighborhood Neighborhood
map frame id to neighbors
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
std::vector< std::size_t > initial_density_clustering(const std::vector< float > &free_energy, const Neighborhood &nh, const float free_energy_threshold, const float *coords, const std::size_t n_rows, const std::size_t n_cols, const std::vector< std::size_t > initial_clusters)
std::vector< FreeEnergy > sorted_free_energies(const std::vector< float > &fe)
std::pair< std::size_t, float > FreeEnergy
matches frame id to free energy
std::set< std::size_t > high_density_neighborhood(const float *coords, const std::size_t n_cols, const std::vector< FreeEnergy > &sorted_fe, const std::size_t i_frame, const std::size_t limit, const float max_dist, const int mpi_n_nodes, const int mpi_node_id)