clustering  0.12
Clustering suite for molecular dynamics trajectories.
 All Classes Namespaces Files Functions Variables Typedefs
mpp.cpp
1 /*
2 Copyright (c) 2015, Florian Sittel (www.lettis.net)
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "tools.hpp"
27 #include "mpp.hpp"
28 #include "logger.hpp"
29 
30 namespace Clustering {
31  namespace MPP {
33  transition_counts(std::vector<std::size_t> trajectory,
34  std::vector<std::size_t> concat_limits,
35  std::size_t n_lag_steps) {
36  if (n_lag_steps == 0) {
37  std::cerr << "error: lagtime of 0 does not make any sense for MPP clustering" << std::endl;
38  exit(EXIT_FAILURE);
39  }
40  std::vector<std::size_t>::iterator next_limit = concat_limits.begin();
41  std::size_t i_max = (*std::max_element(trajectory.begin(), trajectory.end()));
42  SparseMatrixF count_matrix(i_max+1, i_max+1);
43  for (std::size_t i=0; i < trajectory.size() - n_lag_steps; ++i) {
44  std::size_t from = trajectory[i];
45  std::size_t to = trajectory[i+n_lag_steps];
46  if (next_limit != concat_limits.end()) {
47  // check for sub-trajectory limits
48  if (i+n_lag_steps < (*next_limit)) {
49  count_matrix(from, to) += 1;
50  } else if (i+1 == (*next_limit)) {
51  ++next_limit;
52  }
53  } else {
54  // either last sub-trajectory or everything is
55  // a single, continuous trajectory
56  count_matrix(from, to) += 1;
57  }
58  }
59  return count_matrix;
60  }
61 
64  std::set<std::size_t> cluster_names) {
65  std::size_t n_rows = count_matrix.size1();
66  std::size_t n_cols = count_matrix.size2();
67  SparseMatrixF transition_matrix(n_rows, n_cols);
68  for (std::size_t i: cluster_names) {
69  std::size_t row_sum = 0;
70  for (std::size_t j=0; j < n_cols; ++j) {
71  row_sum += count_matrix(i,j);
72  }
73  if (row_sum > 0) {
74  for (std::size_t j=0; j < n_cols; ++j) {
75  if (count_matrix(i,j) != 0) {
76  transition_matrix(i,j) = count_matrix(i,j) / row_sum;
77  }
78  }
79  }
80  }
81  return transition_matrix;
82  }
83 
84  std::map<std::size_t, std::size_t>
86  std::set<std::size_t> cluster_names,
87  float q_min,
88  std::map<std::size_t, float> min_free_energy) {
89  std::map<std::size_t, std::size_t> future_state;
90  for (std::size_t i: cluster_names) {
91  std::vector<std::size_t> candidates;
92  float max_trans_prob = 0.0f;
93  if (transition_matrix(i,i) >= q_min) {
94  // self-transition is greater than stability measure: stay.
95  candidates = {i};
96  } else {
97  for (std::size_t j: cluster_names) {
98  // self-transition lower than q_min:
99  // choose other state as immidiate future
100  // (even if it has lower probability than self-transition)
101  if (i != j) {
102  if (transition_matrix(i,j) > max_trans_prob) {
103  max_trans_prob = transition_matrix(i,j);
104  candidates = {j};
105  } else if (transition_matrix(i,j) == max_trans_prob && max_trans_prob > 0.0f) {
106  candidates.push_back(j);
107  }
108  }
109  }
110  }
111  if (candidates.size() == 0) {
112  std::cerr << "error: state '" << i
113  << "' has self-transition probability of "
114  << transition_matrix(i,i)
115  << " at Qmin "
116  << q_min
117  << " and does not find any transition candidates."
118  << " please have a look at your trajectory!"
119  << std::endl;
120  exit(EXIT_FAILURE);
121  } else if (candidates.size() == 1) {
122  future_state[i] = candidates[0];
123  } else {
124  // multiple candidates: choose the one with lowest Free Energy
125  auto min_fe_compare = [&](std::size_t i, std::size_t j) {
126  return min_free_energy[i] < min_free_energy[j];
127  };
128  future_state[i] = (*std::min_element(candidates.begin(), candidates.end(), min_fe_compare));
129  }
130  }
131  return future_state;
132  }
133 
134  std::map<std::size_t, std::vector<std::size_t>>
135  most_probable_path(std::map<std::size_t, std::size_t> future_state,
136  std::set<std::size_t> cluster_names) {
137  std::map<std::size_t, std::vector<std::size_t>> mpp;
138  for (std::size_t i: cluster_names) {
139  std::vector<std::size_t> path = {i};
140  std::set<std::size_t> visited = {i};
141  std::size_t next_state = future_state[i];
142  // abort when path 'closes' in a loop, i.e.
143  // when a state has been revisited
144  while (visited.count(next_state) == 0) {
145  path.push_back(next_state);
146  visited.insert(next_state);
147  next_state = future_state[next_state];
148  }
149  mpp[i] = path;
150  }
151  return mpp;
152  }
153 
154  std::map<std::size_t, std::size_t>
155  microstate_populations(std::vector<std::size_t> clusters,
156  std::set<std::size_t> cluster_names) {
157  std::map<std::size_t, std::size_t> pops;
158  for (std::size_t i: cluster_names) {
159  pops[i] = std::count(clusters.begin(), clusters.end(), i);
160  }
161  return pops;
162  }
163 
164  // assign every state the lowest free energy value
165  // of all of its frames.
166  std::map<std::size_t, float>
167  microstate_min_free_energy(const std::vector<std::size_t>& clustering,
168  const std::vector<float>& free_energy) {
169  std::map<std::size_t, float> min_fe;
170  for (std::size_t i=0; i < clustering.size(); ++i) {
171  std::size_t state = clustering[i];
172  if (min_fe.count(state) == 0) {
173  min_fe[state] = free_energy[i];
174  } else {
175  if (free_energy[i] < min_fe[state]) {
176  min_fe[state] = free_energy[i];
177  }
178  }
179  }
180  return min_fe;
181  }
182 
183  std::map<std::size_t, std::size_t>
184  path_sinks(std::vector<std::size_t> clusters,
185  std::map<std::size_t, std::vector<std::size_t>> mpp,
186  SparseMatrixF transition_matrix,
187  std::set<std::size_t> cluster_names,
188  float q_min,
189  std::vector<float> free_energy) {
190  std::map<std::size_t, std::size_t> pops = microstate_populations(clusters, cluster_names);
191  std::map<std::size_t, float> min_free_energy = microstate_min_free_energy(clusters, free_energy);
192  std::map<std::size_t, std::size_t> sinks;
193  for (std::size_t i: cluster_names) {
194  std::vector<std::size_t> metastable_states;
195  for (std::size_t j: mpp[i]) {
196  // check: are there stable states?
197  if (transition_matrix(j,j) > q_min) {
198  metastable_states.push_back(j);
199  }
200  }
201  if (metastable_states.size() == 0) {
202  // no stable state: treat all states in path as 'metastable'
203  metastable_states = mpp[i];
204  }
205  // helper function: compare states by their population
206  auto pop_compare = [&](std::size_t i, std::size_t j) -> bool {
207  return pops[i] < pops[j];
208  };
209  // helper function: compare states by their min. Free Energy
210  auto fe_compare = [&](std::size_t i, std::size_t j) -> bool {
211  return min_free_energy[i] < min_free_energy[j];
212  };
213  // find sink candidate state from lowest free energy
214  auto candidate = std::min_element(metastable_states.begin(), metastable_states.end(), fe_compare);
215  float min_fe = free_energy[*candidate];
216  std::set<std::size_t> sink_candidates;
217  while (candidate != metastable_states.end() && free_energy[*candidate] == min_fe) {
218  // there may be several states with same (min.) free energy,
219  // collect them all into one set
220  sink_candidates.insert(*candidate);
221  metastable_states.erase(candidate);
222  candidate = std::min_element(metastable_states.begin(), metastable_states.end(), fe_compare);
223  }
224  // select sink by lowest free energy
225  if (sink_candidates.size() == 1) {
226  sinks[i] = (*sink_candidates.begin());
227  } else {
228  // or highest population, if equal
229  sinks[i] = (*std::max_element(sink_candidates.begin(), sink_candidates.end(), pop_compare));
230  }
231  }
232  return sinks;
233  }
234 
235  // lump states based on path sinks and return new trajectory.
236  // new microstates will have IDs of sinks.
237  std::vector<std::size_t>
238  lumped_trajectory(std::vector<std::size_t> trajectory,
239  std::map<std::size_t, std::size_t> sinks) {
240  for (std::size_t& state: trajectory) {
241  state = sinks[state];
242  }
243  return trajectory;
244  }
245 
246  // run clustering for given Q_min value
247  // returns: {new traj, lumping info}
248  std::tuple<std::vector<std::size_t>, std::map<std::size_t, std::size_t>>
249  fixed_metastability_clustering(std::vector<std::size_t> initial_trajectory,
250  std::vector<std::size_t> concat_limits,
251  float q_min,
252  std::size_t lagtime,
253  std::vector<float> free_energy) {
254  std::set<std::size_t> microstate_names;
255  std::vector<std::size_t> traj = initial_trajectory;
256  std::map<std::size_t, std::size_t> lumping;
257  const uint MAX_ITER=100;
258  uint iter;
259  for (iter=0; iter < MAX_ITER; ++iter) {
260  // reset names in case of vanished states (due to lumping)
261  microstate_names = std::set<std::size_t>(traj.begin(), traj.end());
262  if (microstate_names.count(0)) {
263  std::cerr << "\nwarning:\n"
264  << " there is a state '0' in your trajectory.\n"
265  << " are you sure you generated a proper trajectory of microstates\n"
266  << " (e.g. by running a final, seeded density-clustering to fill up the FEL)?\n"
267  << std::endl;
268  }
269  logger(std::cout) << "iteration "
270  << iter+1
271  << " for q_min "
272  << Clustering::Tools::stringprintf("%0.3f", q_min)
273  << std::endl;
274  // get transition probabilities
275  logger(std::cout) << " calculating transition probabilities" << std::endl;
277  transition_counts(traj, concat_limits, lagtime),
278  microstate_names);
279  // get immediate future
280  logger(std::cout) << " calculating future states" << std::endl;
281  std::map<std::size_t, std::size_t> future_state = single_step_future_state(trans_prob,
282  microstate_names,
283  q_min,
284  microstate_min_free_energy(traj, free_energy));
285  // compute MPP
286  logger(std::cout) << " calculating most probable path" << std::endl;
287  std::map<std::size_t, std::vector<std::size_t>> mpp = most_probable_path(future_state, microstate_names);
288  // compute sinks (i.e. states with lowest Free Energy per path)
289  logger(std::cout) << " calculating path sinks" << std::endl;
290  std::map<std::size_t, std::size_t> sinks = path_sinks(traj
291  , mpp
292  , trans_prob
293  , microstate_names
294  , q_min
295  , free_energy);
296  // lump trajectory into sinks
297  std::vector<std::size_t> traj_old = traj;
298  logger(std::cout) << " lumping trajectory" << std::endl;
299  traj = lumped_trajectory(traj, sinks);
300  for (auto from_to: sinks) {
301  std::size_t from = from_to.first;
302  std::size_t to = from_to.second;
303  if (from != to) {
304  lumping[from] = to;
305  }
306  }
307  // check convergence
308  if (traj_old == traj) {
309  break;
310  }
311  }
312  if (iter == MAX_ITER) {
313  throw std::runtime_error(Clustering::Tools::stringprintf("reached max. no. of iterations for Q_min convergence: %d", iter));
314  } else {
315  return std::make_tuple(traj, lumping);
316  }
317  }
318 
319  void
320  main(boost::program_options::variables_map args) {
321  std::string basename = args["basename"].as<std::string>();
322  // load initial trajectory
323  std::map<std::size_t, std::pair<std::size_t, float>> transitions;
324  std::map<std::size_t, std::size_t> max_pop;
325  std::map<std::size_t, float> max_qmin;
326  Clustering::logger(std::cout) << "loading microstates" << std::endl;
327  std::vector<std::size_t> traj = Clustering::Tools::read_clustered_trajectory(args["input"].as<std::string>());
328  Clustering::logger(std::cout) << "loading free energies" << std::endl;
329  std::vector<float> free_energy = Clustering::Tools::read_free_energies(args["free-energy-input"].as<std::string>());
330  float q_min_from = args["qmin-from"].as<float>();
331  float q_min_to = args["qmin-to"].as<float>();
332  float q_min_step = args["qmin-step"].as<float>();
333  int lagtime = args["lagtime"].as<int>();
334  Clustering::logger(std::cout) << "beginning q_min loop" << std::endl;
335  std::vector<std::size_t> concat_limits;
336  if (args.count("concat-limits")) {
337  concat_limits = Clustering::Tools::read_single_column<std::size_t>(args["concat-limits"].as<std::string>());
338  } else if (args.count("concat-nframes")) {
339  std::size_t n_frames_per_subtraj = args["concat-nframes"].as<std::size_t>();
340  for (std::size_t i=n_frames_per_subtraj; i < traj.size(); i += n_frames_per_subtraj) {
341  concat_limits.push_back(i);
342  }
343  }
344  for (float q_min=q_min_from; q_min <= q_min_to; q_min += q_min_step) {
345  auto traj_sinks = fixed_metastability_clustering(traj, concat_limits, q_min, lagtime, free_energy);
346  // write trajectory at current Qmin level to file
347  traj = std::get<0>(traj_sinks);
349  , basename.c_str()
350  , q_min)
351  , traj);
352  // save transitions (i.e. lumping of states)
353  std::map<std::size_t, std::size_t> sinks = std::get<1>(traj_sinks);
354  for (auto from_to: sinks) {
355  transitions[from_to.first] = {from_to.second, q_min};
356  }
357  // write microstate populations to file
358  std::map<std::size_t, std::size_t> pops = Clustering::Tools::microstate_populations(traj);
359  Clustering::Tools::write_map<std::size_t, std::size_t>(Clustering::Tools::stringprintf("%s_pop_%0.3f.dat"
360  , basename.c_str()
361  , q_min)
362  , pops);
363  // collect max. pops + max. q_min per microstate
364  for (std::size_t id: std::set<std::size_t>(traj.begin(), traj.end())) {
365  max_pop[id] = pops[id];
366  max_qmin[id] = q_min;
367  }
368  }
369  // write transitions to file
370  {
371  std::ofstream ofs(basename + "_transitions.dat");
372  for (auto trans: transitions) {
373  ofs << trans.first << " " << trans.second.first << " " << trans.second.second << "\n";
374  }
375  }
376  Clustering::Tools::write_map<std::size_t, std::size_t>(basename + "_max_pop.dat", max_pop);
377  Clustering::Tools::write_map<std::size_t, float>(basename + "_max_qmin.dat", max_qmin);
378  }
379  } // end namespace MPP
380 } // end namespace Clustering
381 
boost::numeric::ublas::mapped_matrix< float > SparseMatrixF
BOOST implementation of a sparse matrix for floats.
Definition: mpp.hpp:42
SparseMatrixF row_normalized_transition_probabilities(SparseMatrixF count_matrix, std::set< std::size_t > cluster_names)
compute transition matrix from counts by normalization of rows
Definition: mpp.cpp:63
std::map< std::size_t, std::size_t > microstate_populations(std::vector< std::size_t > traj)
compute microstate populations from clustered trajectory
Definition: tools.cpp:163
std::vector< float > read_free_energies(std::string filename)
read free energies from plain text file
Definition: tools.cpp:111
std::map< std::size_t, std::size_t > microstate_populations(std::vector< std::size_t > clusters, std::set< std::size_t > cluster_names)
compute cluster populations
Definition: mpp.cpp:155
std::map< std::size_t, float > microstate_min_free_energy(const std::vector< std::size_t > &clustering, const std::vector< float > &free_energy)
Definition: mpp.cpp:167
std::map< std::size_t, std::vector< std::size_t > > most_probable_path(std::map< std::size_t, std::size_t > future_state, std::set< std::size_t > cluster_names)
Definition: mpp.cpp:135
void write_single_column(std::string filename, std::vector< NUM > dat, bool with_scientific_format=false)
write single column of numbers to given file. number type (int, float, ...) given as template paramet...
Definition: tools.hxx:147
std::map< std::size_t, std::size_t > path_sinks(std::vector< std::size_t > clusters, std::map< std::size_t, std::vector< std::size_t >> mpp, SparseMatrixF transition_matrix, std::set< std::size_t > cluster_names, float q_min, std::vector< float > free_energy)
Definition: mpp.cpp:184
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
std::map< std::size_t, std::size_t > single_step_future_state(SparseMatrixF transition_matrix, std::set< std::size_t > cluster_names, float q_min, std::map< std::size_t, float > min_free_energy)
Definition: mpp.cpp:85
SparseMatrixF transition_counts(std::vector< std::size_t > trajectory, std::vector< std::size_t > concat_limits, std::size_t n_lag_steps)
Definition: mpp.cpp:33
std::tuple< std::vector< std::size_t >, std::map< std::size_t, std::size_t > > fixed_metastability_clustering(std::vector< std::size_t > initial_trajectory, std::vector< std::size_t > concat_limits, float q_min, std::size_t lagtime, std::vector< float > free_energy)
run clustering for given Q_min value
Definition: mpp.cpp:249
std::vector< std::size_t > lumped_trajectory(std::vector< std::size_t > trajectory, std::map< std::size_t, std::size_t > sinks)
Definition: mpp.cpp:238
std::vector< std::size_t > read_clustered_trajectory(std::string filename)
read states from trajectory (given as plain text file)
Definition: tools.cpp:54
void main(boost::program_options::variables_map args)
Definition: mpp.cpp:320
std::string stringprintf(const std::string &str,...)
printf-version for std::string
Definition: tools.cpp:95