clustering  0.12
Clustering suite for molecular dynamics trajectories.
 All Classes Namespaces Files Functions Variables Typedefs
network_builder.cpp
1 /*
2 Copyright (c) 2015, Florian Sittel (www.lettis.net)
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification,
6 are permitted provided that the following conditions are met:
7 
8 1. Redistributions of source code must retain the above copyright notice,
9  this list of conditions and the following disclaimer.
10 
11 2. Redistributions in binary form must reproduce the above copyright notice,
12  this list of conditions and the following disclaimer in the documentation
13  and/or other materials provided with the distribution.
14 
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
18 SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
19 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
20 OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 
26 #include "network_builder.hpp"
27 
28 #include "tools.hpp"
29 #include "logger.hpp"
30 #include "embedded_cytoscape.hpp"
31 
32 #include <fstream>
33 #include <set>
34 #include <unordered_set>
35 #include <limits>
36 
37 #include <boost/program_options.hpp>
38 #include <boost/filesystem.hpp>
39 #include <omp.h>
40 
41 
42 namespace {
46  int HORIZONTAL_SPACING = 10;
47  int VERTICAL_SPACING = 50;
48 // int NODE_SIZE_MIN = 5;
49 // int NODE_SIZE_MAX = 50;
50  // the following values will be set by
51  // 'save_network_to_html'
52  // and are stored here for convenience.
53  // needed by 'Node'-instances for
54  // construction of graphical network,
55  // i.e. to determine node sizes and colors.
56  std::size_t POP_MIN = 0;
57  std::size_t POP_MAX = 0;
58  float FE_MIN = 0.0f;
59  float FE_MAX = 0.0f;
61 
62  struct Node {
63  std::size_t id;
64  float fe;
65  std::size_t pop;
66  std::map<std::size_t, Node> children;
67  int pos_x = 0;
68  int pos_y = 0;
69  int _subtree_width = 0;
70 
71  Node();
72  Node(std::size_t _id, float _fe, std::size_t _pop);
73  Node* find_parent_of(std::size_t search_id);
74  void set_pos(int x, int y);
75  int subtree_width();
76  void print_subtree(std::ostream& os);
77  void print_node_and_subtree(std::ostream& os);
78  };
79 
80  constexpr bool fuzzy_equal(float a, float b, float prec) {
81  return (a <= b + prec) && (a >= b - prec);
82  }
83 
84  // overload output operator for Node-serialization
85  // (producing string representation of node + edges to children)
86  std::ostream& operator<<(std::ostream& os, const Node& n) {
87  float log_pop;
88  if (n.pop <= 0) {
89  log_pop = log(1);
90  } else {
91  log_pop = log(n.pop);
92  }
93  // print node itself
94  os << Clustering::Tools::stringprintf("{group:'nodes',id:'n%d',position:{x:%d,y:%d},data:{id:'n%d',pop:%d,fe:%f,info:'%d: fe=%0.2f, pop=%d',logpop:%0.2f}},\n",
95  n.id, n.pos_x, n.pos_y, n.id, n.pop, n.fe, n.id, n.fe, n.pop, log_pop);
96  // print edges from node's children to node
97  for (auto& id_child: n.children) {
98  std::size_t cid = id_child.first;
99  os << Clustering::Tools::stringprintf("{group:'edges',data:{id:'e%d_%d',source:'n%d',target:'n%d'}},\n", cid, n.id, cid, n.id);
100  }
101  return os;
102  }
103 
104  Node::Node() {}
105 
106  Node::Node(std::size_t _id,
107  float _fe,
108  std::size_t _pop)
109  : id(_id)
110  , fe(_fe)
111  , pop(_pop) {
112  }
113 
114  Node* Node::find_parent_of(std::size_t search_id) {
115  if (this->children.count(search_id)) {
116  return this;
117  } else {
118  for (auto& id_child: children) {
119  Node* cc = id_child.second.find_parent_of(search_id);
120  if (cc) {
121  return cc;
122  }
123  }
124  }
125  return NULL;
126  }
127 
128  void Node::set_pos(int x, int y) {
129  this->pos_x = x;
130  this->pos_y = y;
131  std::vector<int> width_children;
132  int total_width = 0;
133  for (auto& id_child: children) {
134  width_children.push_back(id_child.second.subtree_width());
135  total_width += id_child.second.subtree_width();
136  }
137  // set pos for every child recursively,
138  // regarding proper segmentation of horizontal space.
139  int cur_x = (x-0.5*total_width);
140  for (auto& id_child: children) {
141  int stw = id_child.second.subtree_width();
142  id_child.second.set_pos(cur_x + 0.5*stw, y + VERTICAL_SPACING);
143  cur_x += stw;
144  }
145  }
146 
147  int Node::subtree_width() {
148  // check for backtracked value
149  // and compute if not existing.
150  if ( ! this->_subtree_width) {
151  int self_width = 10 + 2*HORIZONTAL_SPACING; //TODO get from size
152  if (children.empty()) {
153  this->_subtree_width = self_width;
154  } else {
155  int sum = 0;
156  for (auto& id_child: children) {
157  sum += id_child.second.subtree_width();
158  }
159  if (sum > self_width) {
160  this->_subtree_width = sum;
161  } else {
162  this->_subtree_width = self_width;
163  }
164  }
165  }
166  return this->_subtree_width;
167  }
168 
169  void Node::print_subtree(std::ostream& os) {
170  for (auto& id_child: children) {
171  id_child.second.print_node_and_subtree(os);
172  }
173  }
174 
175  void Node::print_node_and_subtree(std::ostream& os) {
176  os << (*this) << std::endl;
177  this->print_subtree(os);
178  }
179 
180 
181  void
182  save_network_links(std::string fname,
183  std::map<std::size_t, std::size_t> network) {
184  Clustering::logger(std::cout) << "saving links" << std::endl;
185  std::ofstream ofs(fname);
186  if (ofs.fail()) {
187  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
188  exit(EXIT_FAILURE);
189  } else {
190  for (auto p: network) {
191  ofs << p.second << " " << p.first << "\n";
192  }
193  }
194  }
195 
196  void
197  save_node_info(std::string fname,
198  std::map<std::size_t, float> free_energies,
199  std::map<std::size_t, std::size_t> pops) {
200  Clustering::logger(std::cout) << "saving nodes" << std::endl;
201  std::ofstream ofs(fname);
202  if (ofs.fail()) {
203  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
204  exit(EXIT_FAILURE);
205  } else {
206  for (auto node_pop: pops) {
207  std::size_t key = node_pop.first;
208  ofs << key << " " << free_energies[key] << " " << node_pop.second << "\n";
209  }
210  }
211  }
212 
213  std::set<std::size_t>
214  compute_and_save_leaves(std::string fname,
215  std::map<std::size_t, std::size_t> network) {
216  Clustering::logger(std::cout) << "saving leaves, i.e. tree end nodes" << std::endl;
217  std::set<std::size_t> leaves;
218  std::set<std::size_t> not_leaves;
219  std::ofstream ofs(fname);
220  if (ofs.fail()) {
221  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
222  exit(EXIT_FAILURE);
223  } else {
224  for (auto from_to: network) {
225  std::size_t src = from_to.first;
226  std::size_t target = from_to.second;
227  not_leaves.insert(target);
228  if (not_leaves.count(src)) {
229  leaves.erase(src);
230  } else {
231  leaves.insert(src);
232  }
233  }
234  for (std::size_t leaf: leaves) {
235  ofs << leaf << "\n";
236  }
237  }
238  return leaves;
239  }
240 
241  void
242  save_traj_of_leaves(std::string fname,
243  std::set<std::size_t> leaves,
244  float d_min,
245  float d_max,
246  float d_step,
247  std::string remapped_name,
248  std::size_t n_rows) {
249  Clustering::logger(std::cout) << "saving end-node trajectory for seeding" << std::endl;
250  std::ofstream ofs(fname);
251  if (ofs.fail()) {
252  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
253  exit(EXIT_FAILURE);
254  } else {
255  std::vector<std::size_t> traj(n_rows);
256  const float prec = d_step / 10.0f;
257  for (float d=d_min; ! fuzzy_equal(d, d_max+d_step, prec); d += d_step) {
258  std::vector<std::size_t> cl_now = Clustering::Tools::read_clustered_trajectory(
259  Clustering::Tools::stringprintf(remapped_name, d));
260  for (std::size_t i=0; i < n_rows; ++i) {
261  if (leaves.count(cl_now[i])) {
262  traj[i] = cl_now[i];
263  }
264  }
265  }
266  for (std::size_t i: traj) {
267  ofs << i << "\n";
268  }
269  }
270  }
271 
272  void
273  save_network_to_html(std::string fname,
274  std::map<std::size_t, std::size_t> network,
275  std::map<std::size_t, float> free_energies,
276  std::map<std::size_t, std::size_t> pops) {
277  Clustering::logger(std::cout) << "computing network visualization" << std::endl;
278  // set (global) values for min/max of free energies and populations
279  FE_MAX = std::max_element(free_energies.begin(),
280  free_energies.end(),
281  [](std::pair<std::size_t, float> fe1,
282  std::pair<std::size_t, float> fe2) -> bool {
283  return fe1.second < fe2.second;
284  })->second;
285  FE_MIN = std::min_element(free_energies.begin(),
286  free_energies.end(),
287  [](std::pair<std::size_t, float> fe1,
288  std::pair<std::size_t, float> fe2) -> bool {
289  return fe1.second < fe2.second;
290  })->second;
291  POP_MAX = std::max_element(pops.begin(),
292  pops.end(),
293  [](std::pair<std::size_t, std::size_t> p1,
294  std::pair<std::size_t, std::size_t> p2) -> bool {
295  return p1.second < p2.second;
296  })->second;
297  POP_MIN = std::min_element(pops.begin(),
298  pops.end(),
299  [](std::pair<std::size_t, std::size_t> p1,
300  std::pair<std::size_t, std::size_t> p2) -> bool {
301  return p1.second < p2.second;
302  })->second;
303  // build trees from given network with respective 'root' on top and at highest FE.
304  // may be multiple trees because there may be multiple nodes that have max FE.
305  Node fake_root;
306  std::size_t network_size = network.size();
307  std::size_t i_frame = 0;
308  for (auto from_to: network) {
309  std::cerr << ++i_frame << " / " << network_size << "\n";
310 
311  std::size_t i_from = from_to.first;
312  std::size_t i_to = from_to.second;
313 
314  Node* parent_to = fake_root.find_parent_of(i_to);
315  if ( ! parent_to) {
316  fake_root.children[i_to] = {i_to, free_energies[i_to], pops[i_to]};
317  parent_to = &fake_root;
318  }
319  Node* parent_from = fake_root.find_parent_of(i_from);
320  if (parent_from) {
321  // move existing node to its right place in the tree
322  parent_to->children[i_to].children[i_from] = parent_from->children[i_from];
323  parent_from->children.erase(i_from);
324  } else {
325  // create child at proper place in tree
326  parent_to->children[i_to].children[i_from] = {i_from, free_energies[i_from], pops[i_from]};
327  }
328  }
329  // write header
330  std::ofstream ofs(fname);
331  if (ofs.fail()) {
332  std::cerr << "error: cannot open file '" << fname << "' for writing." << std::endl;
333  exit(EXIT_FAILURE);
334  } else {
335  float LOG_POP_MIN, LOG_POP_MAX;
336  if (POP_MIN <= 0) {
337  LOG_POP_MIN = 0.0f;
338  } else {
339  LOG_POP_MIN = log(POP_MIN);
340  }
341  if (POP_MAX <= 0) {
342  LOG_POP_MAX = 0.0f;
343  } else {
344  LOG_POP_MAX = log(POP_MAX);
345  }
346  ofs << Clustering::Network::viewer_header
347 
348  << "style: cytoscape.stylesheet().selector('node').css({"
349  << Clustering::Tools::stringprintf("'width': 'mapData(logpop, %0.2f, %0.2f, 5, 30)',", LOG_POP_MIN, LOG_POP_MAX)
350  << Clustering::Tools::stringprintf("'height': 'mapData(logpop, %0.2f, %0.2f, 5, 30)',", LOG_POP_MIN, LOG_POP_MAX)
351  << Clustering::Tools::stringprintf("'background-color': 'mapData(fe, %f, %f, blue, red)'})", FE_MIN, FE_MAX)
352 
353  << ".selector('edge').css({'opacity': '1.0', 'width': '5', 'target-arrow-shape': 'triangle'})"
354  << ".selector(':selected').css({'content': 'data(info)', 'font-size': 24, 'color': '#00ff00'})"
355 
356  << ", elements: [\n";
357  fake_root.set_pos(0, 0);
358  fake_root.print_subtree(ofs);
359  ofs << "]";
360  ofs << Clustering::Network::viewer_footer;
361  }
362  }
363 } // end local namespace
364 
365 
366 namespace Clustering {
367 namespace NetworkBuilder {
368 
369  void
370  main(boost::program_options::variables_map args) {
371  namespace b_fs = boost::filesystem;
372  using namespace Clustering::Tools;
373  // setup two threads parallel read/write
374  omp_set_num_threads(2);
375  // setup general flags / options
376  Clustering::verbose = args["verbose"].as<bool>();
377 
378  float d_min = args["min"].as<float>();
379  float d_max = args["max"].as<float>();
380  float d_step = args["step"].as<float>();
381  std::string basename = args["basename"].as<std::string>();
382  std::string remapped_name = "remapped_" + basename;
383  std::size_t minpop = args["minpop"].as<std::size_t>();
384 
385  std::map<std::size_t, std::size_t> network;
386  std::map<std::size_t, std::size_t> pops;
387  std::map<std::size_t, float> free_energies;
388 
389  std::string fname_next = stringprintf(basename, d_min);
390  if ( ! b_fs::exists(fname_next)) {
391  std::cerr << "error: file does not exist: " << fname_next << std::endl;
392  exit(EXIT_SUCCESS);
393  }
394  std::vector<std::size_t> cl_next = read_clustered_trajectory(fname_next);
395  std::vector<std::size_t> cl_now;
396  std::size_t max_id;
397  std::size_t n_rows = cl_next.size();
398  // re-map states to give every state a unique id.
399  // this is nevessary, since every initially clustered trajectory
400  // at different thresholds uses the same ids starting with 0.
401  const float prec = d_step / 10.0f;
402  if (d_max == 0.0f) {
403  // default: collect all until MAX_FE
404  d_max = std::numeric_limits<float>::max();
405  } else {
406  d_max += d_step;
407  }
408  float d;
409  for (d=d_min; ! fuzzy_equal(d, d_max, prec) && b_fs::exists(fname_next); d += d_step) {
410  Clustering::logger(std::cout) << "free energy level: " << stringprintf("%0.2f", d) << std::endl;
411  cl_now = cl_next;
412  fname_next = stringprintf(basename, d + d_step);
413  #pragma omp parallel sections
414  {
415  #pragma omp section
416  {
417  write_clustered_trajectory(stringprintf(remapped_name, d), cl_now);
418  }
419  #pragma omp section
420  {
421  if (b_fs::exists(fname_next)) {
422  cl_next = read_clustered_trajectory(fname_next);
423  max_id = *std::max_element(cl_now.begin(), cl_now.end());
424  for (std::size_t i=0; i < n_rows; ++i) {
425  if (cl_next[i] != 0) {
426  cl_next[i] += max_id;
427  if (cl_now[i] != 0) {
428  network[cl_now[i]] = cl_next[i];
429  ++pops[cl_now[i]];
430  free_energies[cl_now[i]] = d;
431  }
432  }
433  }
434  }
435  }
436  }
437  }
438  // set correct value for d_max for later reference
439  d_max = d-d_step;
440  // if minpop given: delete nodes and edges not fulfilling min. population criterium
441  if (minpop > 1) {
442  Clustering::logger(std::cout) << "cleaning from low pop. states ..." << std::endl;
443  std::unordered_set<std::size_t> removals;
444  auto pop_it = pops.begin();
445  Clustering::logger(std::cout) << " ... search nodes to remove" << std::endl;
446  while (pop_it != pops.end()) {
447  if (pop_it->second < minpop) {
448  removals.insert(pop_it->first);
449  pops.erase(pop_it++); // as above
450  } else {
451  ++pop_it;
452  }
453  }
454  Clustering::logger(std::cout) << " ... search edges to remove" << std::endl;
455  auto net_it = network.begin();
456  while (net_it != network.end()) {
457  std::size_t a = net_it->first;
458  std::size_t b = net_it->second;
459  if (removals.count(a) > 0 || removals.count(b) > 0) {
460  network.erase(net_it++);
461  } else {
462  ++net_it;
463  }
464  }
465  Clustering::logger(std::cout) << " ... finished." << std::endl;
466  }
467  // save links (i.e. edges) as two-column ascii in a child -> parent manner
468  save_network_links("network_links.dat", network);
469  // save id, population and free energy of nodes
470  save_node_info("network_nodes.dat", free_energies, pops);
471  // compute and directly save network end-nodes (i.e. leaves of the network-tree)
472  std::set<std::size_t> leaves = compute_and_save_leaves("network_leaves.dat", network);
473  // save the trajectory consisting of the 'leaf-states'.
474  // all non-leaf states are kept as non-assignment state '0'.
475  save_traj_of_leaves("network_end_node_traj.dat", leaves, d_min, d_max, d_step, remapped_name, n_rows);
476  // generate html-file with embedded javascript to visualize network
477  save_network_to_html("network_visualization.html", network, free_energies, pops);
478  }
479 } // end namespace NetworkBuilder
480 } // end namespace Clustering
481 
bool verbose
global flag: use verbose output?
Definition: logger.cpp:29
std::ostream & logger(std::ostream &s)
Definition: logger.cpp:32
void write_clustered_trajectory(std::string filename, std::vector< std::size_t > traj)
write state trajectory into plain text file
Definition: tools.cpp:73
std::vector< std::size_t > read_clustered_trajectory(std::string filename)
read states from trajectory (given as plain text file)
Definition: tools.cpp:54
std::string stringprintf(const std::string &str,...)
printf-version for std::string
Definition: tools.cpp:95