00001
00002
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef CLUSTER_H_
00026 #define CLUSTER_H_
00027
00028 #include "elements.h"
00029 #include "tree.h"
00030 #include "common.h"
00031
00032 using namespace std;
00033
00034 int create_tree(multimap<double, string>my_matrix);
00035 void create_supertree(list<Node *> & my_leaves );
00036
00037 void build_tree(Tree &newtree, list<element> cluster, double q) {
00038
00039
00040
00041 double q_ = 0;
00042 double new_q = q/(float)decay;
00043 bool equidistant = false;
00044 Node * temp_current = newtree.pointer_current();
00045 list<element> temp;
00046 list<element>::iterator e_itr = cluster.begin();
00047 bool first_pass = 1;
00048
00049
00050 double check_dist = (*e_itr).get_dist();
00051 while ( e_itr != cluster.end() ) {
00052
00053 if ( (*e_itr).get_dist() != check_dist)
00054 break;
00055 ++e_itr;
00056 }
00057 if ( e_itr == cluster.end() )
00058 equidistant = true;
00059 e_itr = cluster.begin();
00060
00061 while ( e_itr != cluster.end() ) {
00062
00063 if ( (*e_itr).get_dist() > q_ ) {
00064
00065 if ( equidistant ) {
00066 list<element>::iterator temp_itr = cluster.begin();
00067 while ( temp_itr != cluster.end() ) {
00068 temp.push_back((*e_itr));
00069 ++temp_itr;
00070 }
00071 }
00072 if ( !temp.empty() ) {
00073 if ( first_pass ) {
00074 newtree.insert(temp, 0);
00075 newtree.left();
00076 first_pass = 0;
00077 }
00078 else {
00079 newtree.insert(temp,1);
00080 newtree.right();
00081 }
00082 if ( (temp.size() > lambda) && (!equidistant) ) {
00083 Node * new_current = newtree.pointer_current();
00084 build_tree(newtree, temp, new_q);
00085 newtree.SetCurrent(new_current);
00086 }
00087
00088 temp.clear();
00089
00090 }
00091 q_ += new_q;
00092
00093 }
00094 else {
00095 temp.push_back((*e_itr));
00096 ++e_itr;
00097 }
00098
00099 }
00100 if ( !temp.empty() ) {
00101 if ( first_pass ) {
00102 newtree.insert(temp,0);
00103 newtree.left();
00104 }
00105 else {
00106 newtree.insert(temp, 1);
00107 newtree.right();
00108 }
00109 if ( (temp.size() > lambda) && (!equidistant) ) {
00110 Node * last = newtree.pointer_current();
00111 build_tree(newtree, temp, new_q);
00112 newtree.SetCurrent(last);
00113 }
00114 temp.clear();
00115 }
00116 nid temp_id = temp_current->id;
00117 nid my_id = newtree.pointer_current()->id;
00118 if ( DISPLAY ) {
00119 cout << "my id is: " << endl;
00120 for ( unsigned int i = 0; i < my_id.size(); ++i)
00121 cout << " | " << my_id[i];
00122 cout << endl;
00123 cout << "set parent to node with the following id: " << endl;
00124 for ( unsigned int i = 0; i < temp_id.size(); ++i)
00125 cout << " | " << temp_id[i];
00126 cout << endl;
00127 }
00128 newtree.SetCurrent(temp_current);
00129 }
00130
00131 int create_tree(multimap<double, string> my_matrix) {
00132
00133 vector<int> recognition(num_taxa, 0);
00134 list<element> clustered;
00135
00136 ofstream fout;
00137 fout.open("graph_me.txt");
00138 double q_heat = initial_q;
00139 fout << "for heating value of: " << q_heat << endl;
00140 fout << "cluster size" << endl;
00141 float q = 0;
00142
00143 multimap<double, string>::iterator m_itr = my_matrix.begin();
00144 int count_clusters = 1;
00145 int largest = 0;
00146 int smallest = 10000;
00147 bool first_pass = 1;
00148 Tree my_tree;
00149 my_tree.insert(clustered, 0);
00150 clustered.clear();
00151 while (m_itr != my_matrix.end()) {
00152
00153 if (m_itr->first > q ){
00154 if (!clustered.empty()) {
00155 if (first_pass) {
00156 my_tree.insert(clustered, 0);
00157 my_tree.left();
00158 first_pass = 0;
00159 }
00160 else {
00161 my_tree.insert(clustered, 1);
00162 my_tree.right();
00163 }
00164
00165 if ( clustered.size() > lambda ) {
00166 Node * reset_node = my_tree.pointer_current();
00167 build_tree(my_tree, clustered, q_heat);
00168 my_tree.SetCurrent(reset_node);
00169 }
00170
00171 fout << count_clusters << " " << clustered.size() << endl;
00172 if ( DISPLAY ) {
00173 cout << "cluster " << count_clusters << ": " << endl;
00174 list<element>::iterator itr = clustered.begin();
00175 while ( itr != clustered.end() ) {
00176 cout << (*itr).get_taxa() << endl;
00177 ++itr;
00178 }
00179 }
00180
00181 if (clustered.size() > largest) {
00182 largest = clustered.size();
00183 }
00184 if (clustered.size() < smallest) {
00185 smallest = clustered.size();
00186 }
00187
00188 ++count_clusters;
00189 clustered.clear();
00190 }
00191
00192 q+=q_heat;
00193
00194 }
00195 else {
00196 string temp = m_itr->second;
00197 string temp2;
00198 string::iterator t_itr= temp.begin();
00199 bool done = false;
00201 while ( t_itr != temp.end() ) {
00202
00203 if ( (*t_itr) == '.' ) {
00204 temp.erase(t_itr);
00205 done = true;
00206 break;
00207 }
00208 else {
00209 temp2.push_back((*t_itr));
00210 temp.erase(t_itr);
00211 }
00212 if ( done == true )
00213 break;
00214
00215 }
00216
00217 int first = 0, second = 0;
00218 stringstream ss1, ss2;
00219 ss1 << temp;
00220 ss1 >> first;
00221 ss2 << temp2;
00222 ss2 >> second;
00225 temp.clear(); temp2.clear();
00226 int tally = 0;
00227 if ( recognition[first-1] != 0 ) {
00228 tally += recognition[first-1];
00229
00230
00231 }
00232
00233
00234
00235 if ( recognition[second-1] != 0 ) {
00236 tally += recognition[second-1];
00237
00238
00239 }
00240
00241
00242
00243 if (tally < 2 ) {
00244 element temp(m_itr->second, m_itr->first);
00245 clustered.push_back(temp);
00246 recognition[first-1] += 1;
00247 recognition[second-1]+= 1;
00248 }
00249 ++m_itr;
00250 }
00251 }
00252
00253 if ( !clustered.empty() ) {
00254 if ( first_pass ) {
00255 my_tree.insert(clustered, 0);
00256 my_tree.left();
00257 }
00258 else {
00259 my_tree.insert(clustered, 1);
00260 my_tree.right();
00261 }
00262 if ( clustered.size() > lambda ) {
00263 Node * last = my_tree.pointer_current();
00264 build_tree(my_tree, clustered, q_heat);
00265 my_tree.SetCurrent(last);
00266 }
00267 clustered.clear();
00268 }
00270 cout << "Some statistics (for initial cluster size)..." << endl;
00271 cout << "number of clusters: " << count_clusters << endl;
00272 cout << "largest cluster size is: " << largest << endl;
00273 cout << "smallest cluster size is: " << smallest << endl;
00275 cout << "unclustered elements: " << endl;
00276 for (unsigned int i = 0; i < recognition.size(); ++i) {
00277 if ( recognition[i] < 1 )
00278 cout << i+1 << endl;
00279 }
00280 cout << endl;
00281 my_tree.reset();
00282 if ( DISPLAY ) {
00283 show_leaves(my_tree.pointer_current());
00284 my_tree.reset();
00285 }
00286 list<Node *> my_leaves;
00287 find_leaves(my_leaves, my_tree.pointer_current());
00288 if ( DISPLAY ) {
00289 cout << endl << endl << endl << "LEAF NODES: " << endl;
00290 list<Node *>::iterator l_itr = my_leaves.begin();
00291 while ( l_itr != my_leaves.end() ) {
00292 display_node(*l_itr);
00293 ++l_itr;
00294 }
00295 cout << endl << endl << endl << "ALL NODES: " << endl;
00297 my_tree.DisplayPreorder(my_tree.pointer_current());
00298 }
00299 create_supertree(my_leaves);
00300 cout << "super tree creation complete." << endl;
00301 return 0;
00302 }
00303
00304
00305 vector<int> separate(string taxon) {
00306 vector<int> my_taxa;
00307 int i = 0;
00308 string s1, s2;
00309 while (taxon[i] != '.') {
00310 s1.push_back(taxon[i]);
00311 ++i;
00312 }
00313 if ( taxon[i] == '.' )
00314 ++i;
00315 for ( i; i < taxon.size(); ++i ) {
00316 s2.push_back(taxon[i]);
00317 }
00318
00319 int first = 0, second = 0;
00320 stringstream ss1, ss2;
00321 ss1 << s1;
00322 ss1 >> first;
00323 ss2 << s2;
00324 ss2 >> second;
00325
00326 my_taxa.push_back(first);
00327 my_taxa.push_back(second);
00328
00329 return my_taxa;
00330 }
00331
00332 map<int, list<int> > create_hash(list<Node *> my_leaves) {
00333
00334 map<int, list<int> > hash_map;
00335 list<Node *>::iterator sift = my_leaves.begin();
00336 while ( sift != my_leaves.end() ) {
00337 list<element> clust = (*sift)->data;
00338 list<element>::iterator c_itr = clust.begin();
00339 while ( c_itr != clust.end() ) {
00340 string temp = (*c_itr).get_taxa();
00341 vector<int> taxas = separate(temp);
00342 for ( unsigned int i = 0; i < taxas.size(); ++i) {
00343 map<int, list<int> >::iterator hasher;
00344 hasher = hash_map.find(taxas[i]);
00345 if ( hasher != hash_map.end() ) {
00346 if ( i == 0 ) {
00347 hash_map[taxas[i]].push_back(taxas[1]);
00348 }
00349 else {
00350 hash_map[taxas[i]].push_back(taxas[0]);
00351 }
00352 }
00353 else {
00354 list<int> new_list;
00355 if ( i == 0 ) {
00356 new_list.push_back(taxas[i+1]);
00357 }
00358 else
00359 new_list.push_back(taxas[i-1]);
00360 hash_map.insert( make_pair(taxas[i],new_list) );
00361 }
00362 }
00363 ++c_itr;
00364 }
00365 ++sift;
00366 }
00367
00368 return hash_map;
00369 }
00370
00371 map<vector<short>, list<SuperTree> > create_superhash(list<SuperTree> subtrees, int & longest) {
00373 map<vector<short>, list<SuperTree> > hash_map;
00374 map<vector<short>, list<SuperTree> >::iterator hasher;
00375 list<SuperTree>::iterator place = subtrees.begin();
00376 longest = 0;
00377 while ( place != subtrees.end() ) {
00378 (*place).reset();
00379 vector<short> id = (*place).Id();
00380 if (id.size() > longest)
00381 longest = id.size();
00382 hasher = hash_map.find(id);
00383 if ( hasher != hash_map.end() ) {
00384 hash_map[id].push_back((*place));
00385 }
00386 else {
00387 list<SuperTree> new_list;
00388 new_list.push_back((*place));
00389 hash_map.insert(make_pair(id, new_list));
00390 }
00391 ++place;
00392 }
00393
00394 return hash_map;
00395 }
00396
00397 void merge(int starter, map<int, list<int> > & my_map, vector<int> & product) {
00398 bool push = true;
00399 for ( unsigned int i = 0; i < product.size(); ++i) {
00400 if ( product[i] == starter)
00401 push = false;
00402 }
00403 if (push)
00404 product.push_back(starter);
00405
00406 if ( my_map[starter].empty() )
00407 return;
00408 else {
00409 while ( !my_map[starter].empty() ) {
00410 int new_starter = my_map[starter].front();
00411 my_map[starter].pop_front();
00412 merge(new_starter, my_map, product);
00413 }
00414 }
00415 }
00416
00417 void create_subtrees(vector<helper> help, list<SuperTree> & all_trees) {
00418 for ( unsigned int i = 0; i < help.size(); ++i ) {
00419 vector<int> temp_tree = help[i].get_tree();
00420 vector<short> temp_id = help[i].get_id();
00421 int internal = temp_tree.size() - 1;
00422 SuperTree new_tree;
00423 new_tree.insert(0, 0, temp_id, true);
00424 int taxa_ = temp_tree.back();
00425 temp_tree.pop_back();
00426 new_tree.insert(taxa_, 1, temp_id, false);
00427 internal--;
00428 while ( internal > 0 ) {
00429 new_tree.insert(-1, 0, temp_id, true);
00430 new_tree.left();
00431 taxa_ = temp_tree.back();
00432 temp_tree.pop_back();
00433 new_tree.insert(taxa_, 1, temp_id, false);
00434 internal--;
00435 }
00436 taxa_ = temp_tree.back();
00437 temp_tree.pop_back();
00438 new_tree.insert(taxa_, 0, temp_id, false);
00439 new_tree.left();
00440
00441 new_tree.reset();
00442 all_trees.push_back(new_tree);
00443 new_tree.clear();
00444 }
00445 return;
00446 }
00447
00449 void display_hash( map<vector<short>, list<SuperTree> > merge_hash ) {
00450 map<vector<short>, list<SuperTree> >::iterator show = merge_hash.begin();
00451 while ( show != merge_hash.end() ) {
00452 vector<short> temp_id = show->first;
00453 for ( int i = 0; i < temp_id.size(); ++i) {
00454 cout << temp_id[i] << "|";
00455 }
00456 cout << " :||: ";
00457 list<SuperTree> temp = show->second;
00458 list<SuperTree>::iterator stuff = temp.begin();
00459 while ( stuff != temp.end() ) {
00460 (*stuff).DisplayInorder( (*stuff).root() );
00461 ++stuff;
00462 cout << "**********************************************" << endl;
00463 }
00464 cout << endl;
00465
00466 ++show;
00467 }
00468 }
00469
00471 void merge_supertrees(map<vector<short>, list<SuperTree> > hash, int longest ) {
00473
00475 while ( longest > 1 && hash.size() != 1 ) {
00476 map<vector<short>, list<SuperTree> >::iterator merger = hash.begin();
00477 map<vector<short>, list<SuperTree> >::iterator finder;
00478 while ( merger != hash.end() ) {
00479 if ( merger->second.size() > 1 ) {
00480 list<SuperTree> temp = merger->second;
00481 merge_trees(temp);
00482 merger->second.clear();
00483 merger->second = temp;
00484 temp.clear();
00485 }
00486 ++merger;
00487 }
00488
00489 if ( DISPLAY )
00490 display_hash(hash);
00491
00493 merger = hash.begin();
00494 if ( DISPLAY )
00495 cout << "STAGE TWO OF MERGE!!!!" << endl;
00496 while ( merger != hash.end() ) {
00497 if ( merger->first.size() == longest) {
00498 vector<short> temp_id = merger->first;
00499 list<SuperTree> temp_list = merger->second;
00500 map<vector<short>, list<SuperTree> >::iterator temp_itr = merger;
00501 ++merger;
00502 hash.erase(temp_itr);
00504 temp_id.pop_back();
00505 temp_list.front().root()->id.pop_back();
00506 finder = hash.find(temp_id);
00507 if ( finder != hash.end() ) {
00508 hash[temp_id].push_back(temp_list.front());
00509 }
00510 else {
00511 hash.insert(make_pair(temp_id, temp_list));
00512 }
00513 }
00514 else
00515 ++merger;
00516 }
00517 --longest;
00518
00519 if ( DISPLAY ) {
00520 display_hash(hash);
00521 cout << "longest is: " << longest << endl;
00522 cout << "hash table's size is: " << hash.size() << endl;
00523 }
00524 }
00525
00526 map<vector<short>, list<SuperTree> >::iterator merger = hash.begin();
00527 while ( merger != hash.end() ) {
00528 if ( merger->second.size() > 1 ) {
00529 list<SuperTree> temp = merger->second;
00530 merge_trees(temp);
00531 merger->second.clear();
00532 merger->second = temp;
00533 temp.clear();
00534 }
00535 ++merger;
00536 }
00537
00538 merger = hash.begin();
00539 SuperTree checker = merger->second.front();
00540 list<string> print;
00541 checker.Newick(print, checker.root());
00542 print.push_back(";");
00543 cout << "The outputted tree is:" << endl;
00544 ofstream tout;
00545 tout.open("newick.txt");
00546 int count_taxa = 0;
00547 list<string>::iterator show = print.begin();
00548 while (show != print.end() ) {
00549 cout << (*show) << " ";
00550 tout << (*show);
00551 if ( ((*show)!=")") && ((*show)!="(") && ((*show)!=",") ) {
00552 ++count_taxa;
00553 }
00554 ++show;
00555 }
00556 cout << endl;
00557 tout.close();
00558 cout << "we have successfully created a tree with " << num_taxa << " taxa." << endl;
00559
00560 }
00561
00562 void create_supertree(list<Node *> & my_leaves ) {
00563
00564 list<SuperTree> subtrees;
00565
00567 map<int, list<int> > hash_map;
00568 hash_map = create_hash(my_leaves);
00569
00570
00571
00572 if ( DISPLAY ) {
00573 map<int, list<int> >::iterator show = hash_map.begin();
00574 while ( show != hash_map.end() ) {
00575 cout << show->first << " | ";
00576 list<int> temp = show->second;
00577 list<int>::iterator stuff = temp.begin();
00578 while ( stuff != temp.end() ) {
00579 cout << (*stuff) << " ";
00580 ++stuff;
00581 }
00582 cout << endl;
00583 ++show;
00584 }
00585 }
00586
00587 list<Node *>::iterator rake = my_leaves.begin();
00588 vector< helper > temp_clusters;
00589 while ( rake != my_leaves.end() ) {
00590 list<element> group = (*rake)->data;
00591 vector<short> id = (*rake)->id;
00592 list<element>::iterator sorter = group.begin();
00593 while ( sorter != group.end() ) {
00594 string taxa = (*sorter).get_taxa();
00595 vector<int> taxon = separate(taxa);
00596 vector<int> merged;
00597 if (!hash_map[taxon[0]].empty()) {
00598 merge(taxon[0], hash_map, merged);
00599 if ( DISPLAY) {
00600 cout << "merged set: " << endl;
00601 for ( unsigned int i = 0; i < merged.size(); ++i) {
00602 cout << merged[i] << ".";
00603 }
00604 cout << endl;
00605 }
00606 helper tree_guide(id, merged);
00607 temp_clusters.push_back(tree_guide);
00608 }
00609 ++sorter;
00610 }
00611 ++rake;
00612 }
00613
00614 if ( DISPLAY ) {
00615 for ( unsigned int i = 0; i < temp_clusters.size(); ++i ) {
00616 temp_clusters[i].display();
00617 }
00618 }
00619
00620 create_subtrees(temp_clusters, subtrees);
00621 cout << "sub-tree creation complete." << endl;
00622
00624 map<vector<short>, list<SuperTree> > merge_hash;
00625 int longest = 0;
00626 merge_hash = create_superhash(subtrees, longest);
00627
00628 if ( DISPLAY ) {
00629 cout << "longest id is: " << longest << endl;
00630 cout << "Printing out merge_hash!" << endl;
00631 display_hash(merge_hash);
00632 }
00633 merge_supertrees(merge_hash, longest);
00634
00635 return;
00636 }
00637
00638 #endif