cluster.h

00001 
00002 
00007 //(c) 2006 Suzanne Matthews
00008 /*
00009 This file is part of the RaqApproach software package.
00010 
00011     RaqApproach is free software; you can redistribute it and/or modify
00012     it under the terms of the GNU General Public License as published by
00013     the Free Software Foundation; either version 2 of the License, or
00014     (at your option) any later version.
00015 
00016     RaqApproach is distributed in the hope that it will be useful,
00017     but WITHOUT ANY WARRANTY; without even the implied warranty of
00018     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019     GNU General Public License for more details.
00020 
00021     You should have received a copy of the GNU General Public License
00022     along with RaqApproach; if not, write to the Free Software
00023     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00024 */
00025 #ifndef CLUSTER_H_
00026 #define CLUSTER_H_
00027 
00028 #include "elements.h"
00029 #include "tree.h"
00030 #include "common.h"
00031 
00032 using namespace std;
00033 
00034 int create_tree(multimap<double, string>my_matrix); 
00035 void create_supertree(list<Node *> & my_leaves ); 
00036 
00037 void build_tree(Tree &newtree, list<element> cluster, double q) { 
00038 /*using a lambda value (of 2), the buid_tree() function recursively breaks down the current
00039   element set and add it to our decomposition
00040   tree*/
00041         double q_ = 0; 
00042         double new_q = q/(float)decay; 
00043         bool equidistant = false; 
00044         Node * temp_current = newtree.pointer_current();
00045         list<element> temp;
00046         list<element>::iterator e_itr = cluster.begin();
00047         bool first_pass = 1; 
00048 
00049 
00050         double check_dist = (*e_itr).get_dist();
00051         while ( e_itr != cluster.end() ) { 
00052 
00053                 if ( (*e_itr).get_dist() != check_dist)
00054                         break;
00055                 ++e_itr;
00056         }
00057         if ( e_itr == cluster.end() ) 
00058                 equidistant = true; 
00059         e_itr = cluster.begin();
00060 
00061         while ( e_itr != cluster.end() ) { 
00062 
00063                 if ( (*e_itr).get_dist() > q_ ) { 
00064 
00065              if ( equidistant ) { 
00066                                  list<element>::iterator temp_itr =  cluster.begin();
00067                                  while ( temp_itr != cluster.end() ) {
00068                                          temp.push_back((*e_itr)); 
00069                                          ++temp_itr;
00070                                  }
00071                          }
00072                         if ( !temp.empty() ) {
00073                                 if ( first_pass ) { 
00074                                         newtree.insert(temp, 0); 
00075                                         newtree.left();
00076                                         first_pass = 0; 
00077                                 }
00078                                 else {
00079                                         newtree.insert(temp,1); 
00080                                         newtree.right();
00081                                 }
00082                                 if ( (temp.size() > lambda) && (!equidistant) ) { 
00083                                         Node * new_current = newtree.pointer_current();
00084                                         build_tree(newtree, temp, new_q); 
00085                                         newtree.SetCurrent(new_current);
00086                                 }
00087 
00088                                 temp.clear(); 
00089 
00090                         }
00091                         q_ += new_q; 
00092 
00093                 } 
00094                 else {
00095                         temp.push_back((*e_itr)); 
00096                         ++e_itr;
00097                 }
00098 
00099         } 
00100         if ( !temp.empty() ) { 
00101                  if ( first_pass ) { 
00102                          newtree.insert(temp,0); 
00103                          newtree.left();
00104                  }
00105                  else {
00106                         newtree.insert(temp, 1); 
00107                         newtree.right();
00108                 }
00109                 if ( (temp.size() > lambda) && (!equidistant) ) { 
00110                         Node * last = newtree.pointer_current();
00111                         build_tree(newtree, temp, new_q); 
00112                         newtree.SetCurrent(last);
00113                 }
00114                 temp.clear(); 
00115         }
00116         nid temp_id = temp_current->id;
00117         nid my_id = newtree.pointer_current()->id;
00118         if ( DISPLAY ) {
00119                 cout << "my id is: " << endl;
00120                 for ( unsigned int i = 0; i < my_id.size(); ++i)
00121                                 cout << " | " << my_id[i];
00122                 cout << endl;
00123                 cout << "set parent to node with the following id: " << endl;
00124                 for ( unsigned int i = 0; i < temp_id.size(); ++i)
00125                         cout << " | " << temp_id[i];
00126                 cout << endl;
00127         }
00128         newtree.SetCurrent(temp_current); //set the current node back up to the node where the recursion began
00129 } //end function
00130 
00131 int create_tree(multimap<double, string> my_matrix) {
00132 
00133   vector<int> recognition(num_taxa, 0); 
00134   list<element> clustered; 
00135   /*create_tree accepts an inputted matrix and from it creates a tree structure in newick format*/
00136   ofstream fout;
00137   fout.open("graph_me.txt"); 
00138   double q_heat = initial_q; 
00139   fout << "for heating value of: " << q_heat << endl;
00140   fout << "cluster size" << endl;
00141   float q = 0; 
00142 
00143   multimap<double, string>::iterator m_itr = my_matrix.begin();
00144   int count_clusters = 1;
00145   int largest = 0;
00146   int smallest = 10000; 
00147   bool first_pass = 1;
00148   Tree my_tree;  
00149   my_tree.insert(clustered, 0);  
00150   clustered.clear();
00151   while (m_itr != my_matrix.end()) {  
00152 
00153     if (m_itr->first > q ){ 
00154       if (!clustered.empty()) { 
00155          if (first_pass) { 
00156                           my_tree.insert(clustered, 0); 
00157                           my_tree.left(); 
00158                           first_pass = 0; 
00159                   }
00160                   else {
00161                           my_tree.insert(clustered, 1); 
00162                           my_tree.right(); 
00163                         }
00164 
00165                   if ( clustered.size() > lambda ) { 
00166                           Node * reset_node = my_tree.pointer_current();
00167                           build_tree(my_tree, clustered, q_heat); 
00168                           my_tree.SetCurrent(reset_node);
00169                   }
00170 
00171           fout << count_clusters << " " << clustered.size() << endl;
00172                   if ( DISPLAY ) {
00173                           cout << "cluster " << count_clusters << ": " << endl;
00174                           list<element>::iterator itr = clustered.begin();
00175                           while ( itr != clustered.end() ) {
00176                                   cout << (*itr).get_taxa() << endl;
00177                                   ++itr;
00178                           }
00179                   }
00180 
00181                   if (clustered.size() > largest) { 
00182                           largest = clustered.size();
00183                   }
00184                   if (clustered.size() < smallest) { 
00185                           smallest = clustered.size();
00186                   }
00187 
00188                   ++count_clusters; 
00189                   clustered.clear(); 
00190          }
00191 
00192      q+=q_heat; 
00193 
00194     }
00195     else { 
00196                 string temp = m_itr->second;
00197                 string temp2;
00198                 string::iterator t_itr= temp.begin();
00199                 bool done = false;
00201                 while ( t_itr != temp.end() ) {
00202 
00203                         if ( (*t_itr) == '.' ) {
00204                                 temp.erase(t_itr);
00205                                 done = true;
00206                                 break;
00207                         }
00208                         else {
00209                                 temp2.push_back((*t_itr));
00210                                 temp.erase(t_itr);
00211                         }
00212                         if ( done == true )
00213                                 break;
00214 
00215                 }
00216 
00217                 int first = 0, second = 0;
00218                 stringstream ss1, ss2;
00219                 ss1 << temp;
00220                 ss1 >> first;
00221                 ss2 << temp2;
00222                 ss2 >> second;
00225                 temp.clear(); temp2.clear();
00226                 int tally = 0; 
00227                 if ( recognition[first-1] != 0 ) {
00228                         tally += recognition[first-1];
00229                         //if ( recognition[first-1] > 2 )
00230                         //      recognition[first-1] == 2;
00231                 }
00232                 //else
00233                         //recognition[first-1] = recognition[first-1] + 1;
00234 
00235                 if ( recognition[second-1] != 0 ) {
00236                         tally += recognition[second-1];
00237                         //if ( recognition[second-1] > 2 )
00238                         //      recognition[second-1] == 2;
00239                 }
00240                 //else
00241                         //recognition[second-1] = recognition[second-1] + 1;
00242 
00243                 if (tally < 2 ) { 
00244                         element temp(m_itr->second, m_itr->first);
00245                         clustered.push_back(temp);
00246                         recognition[first-1] += 1;
00247                         recognition[second-1]+= 1;
00248                 }
00249             ++m_itr; 
00250     }
00251   } 
00252 
00253   if ( !clustered.empty() ) { 
00254           if ( first_pass ) { 
00255                   my_tree.insert(clustered, 0); 
00256                   my_tree.left();
00257           }
00258           else { 
00259                 my_tree.insert(clustered, 1);
00260                 my_tree.right();
00261           }
00262           if ( clustered.size() > lambda ) { 
00263                 Node * last = my_tree.pointer_current();
00264                 build_tree(my_tree, clustered, q_heat); 
00265                 my_tree.SetCurrent(last);
00266           }
00267           clustered.clear(); 
00268   }
00270   cout << "Some statistics (for initial cluster size)..." << endl;
00271   cout << "number of clusters: " << count_clusters << endl;
00272   cout << "largest cluster size is: " << largest << endl;
00273   cout << "smallest cluster size is: " << smallest << endl;
00275   cout << "unclustered elements: " << endl;
00276   for (unsigned int i = 0; i < recognition.size(); ++i) {
00277           if ( recognition[i] < 1 )
00278                 cout << i+1 << endl;
00279         }
00280   cout << endl;
00281   my_tree.reset();
00282   if ( DISPLAY ) {
00283         show_leaves(my_tree.pointer_current()); 
00284         my_tree.reset();
00285   }
00286   list<Node *> my_leaves;
00287   find_leaves(my_leaves, my_tree.pointer_current()); 
00288   if ( DISPLAY ) {
00289           cout << endl << endl << endl << "LEAF NODES: " << endl;
00290           list<Node *>::iterator l_itr = my_leaves.begin(); 
00291           while ( l_itr != my_leaves.end() ) {
00292                   display_node(*l_itr);
00293                   ++l_itr;
00294           }
00295           cout << endl << endl << endl << "ALL NODES: " << endl;
00297           my_tree.DisplayPreorder(my_tree.pointer_current());
00298   }
00299   create_supertree(my_leaves); 
00300   cout << "super tree creation complete." << endl;
00301   return 0;
00302 }
00303 
00304 
00305 vector<int> separate(string taxon) { 
00306         vector<int> my_taxa;
00307         int i = 0;
00308         string s1, s2;
00309         while (taxon[i] != '.') {
00310                 s1.push_back(taxon[i]);
00311                 ++i;
00312         }
00313         if ( taxon[i] == '.' )
00314                 ++i;
00315         for ( i; i < taxon.size(); ++i ) {
00316                 s2.push_back(taxon[i]);
00317         }
00318 
00319         int first = 0, second = 0;
00320         stringstream ss1, ss2;
00321         ss1 << s1;
00322         ss1 >> first;
00323         ss2 << s2;
00324     ss2 >> second;
00325 
00326         my_taxa.push_back(first);
00327         my_taxa.push_back(second);
00328 
00329         return my_taxa; 
00330 }
00331 
00332 map<int, list<int> > create_hash(list<Node *> my_leaves) { 
00333 
00334         map<int, list<int> > hash_map; 
00335         list<Node *>::iterator sift = my_leaves.begin();
00336         while ( sift != my_leaves.end() ) { 
00337                 list<element> clust = (*sift)->data; 
00338                 list<element>::iterator c_itr = clust.begin();
00339                 while ( c_itr != clust.end() ) { 
00340                         string temp = (*c_itr).get_taxa();
00341                         vector<int> taxas = separate(temp); 
00342                         for ( unsigned int i = 0; i < taxas.size(); ++i) {
00343                                 map<int, list<int> >::iterator hasher;
00344                                 hasher = hash_map.find(taxas[i]); 
00345                                 if ( hasher != hash_map.end() ) { 
00346                                         if ( i == 0 ) { 
00347                                                 hash_map[taxas[i]].push_back(taxas[1]); 
00348                                         }
00349                                         else {
00350                                                 hash_map[taxas[i]].push_back(taxas[0]); 
00351                                         }
00352                                 }
00353                                 else { 
00354                                         list<int> new_list;
00355                                         if ( i == 0 ) {
00356                                                 new_list.push_back(taxas[i+1]);
00357                                         }
00358                                         else
00359                                                 new_list.push_back(taxas[i-1]);
00360                                         hash_map.insert( make_pair(taxas[i],new_list) );
00361                                 }
00362                         }
00363                         ++c_itr; 
00364                 }
00365                 ++sift; 
00366         }
00367 
00368         return hash_map; 
00369 }
00370 
00371 map<vector<short>, list<SuperTree> > create_superhash(list<SuperTree> subtrees, int &  longest) {
00373         map<vector<short>, list<SuperTree> > hash_map;
00374         map<vector<short>, list<SuperTree> >::iterator hasher;
00375         list<SuperTree>::iterator place = subtrees.begin();
00376         longest = 0; 
00377         while ( place != subtrees.end() ) {
00378                 (*place).reset(); 
00379                 vector<short> id = (*place).Id(); 
00380                 if (id.size() > longest)
00381                         longest = id.size(); 
00382                 hasher = hash_map.find(id); 
00383                 if ( hasher != hash_map.end() ) { 
00384                         hash_map[id].push_back((*place)); 
00385                 }
00386                 else { 
00387                         list<SuperTree> new_list;
00388                         new_list.push_back((*place));
00389                         hash_map.insert(make_pair(id, new_list)); 
00390                 }
00391                 ++place;
00392         }
00393 
00394         return hash_map; 
00395 }
00396 
00397 void merge(int starter, map<int, list<int> > & my_map, vector<int> & product) { 
00398         bool push = true;
00399         for ( unsigned int i = 0; i < product.size(); ++i) { 
00400                 if ( product[i] == starter)
00401                 push = false;
00402         }
00403         if (push)
00404                 product.push_back(starter); 
00405 
00406         if ( my_map[starter].empty() ) 
00407                 return;
00408         else {
00409                 while ( !my_map[starter].empty() ) { 
00410                         int new_starter = my_map[starter].front(); 
00411                         my_map[starter].pop_front(); 
00412                         merge(new_starter, my_map, product); 
00413                 }
00414         }
00415 }
00416 
00417 void create_subtrees(vector<helper> help, list<SuperTree> & all_trees) { 
00418         for ( unsigned int i = 0; i < help.size(); ++i ) {
00419                 vector<int> temp_tree = help[i].get_tree(); 
00420                 vector<short> temp_id = help[i].get_id(); 
00421                 int internal = temp_tree.size() - 1; 
00422                 SuperTree new_tree;
00423                 new_tree.insert(0, 0, temp_id, true); 
00424                 int taxa_ = temp_tree.back();
00425                 temp_tree.pop_back();
00426                 new_tree.insert(taxa_, 1, temp_id, false); 
00427                 internal--; 
00428                 while ( internal > 0 ) { 
00429                         new_tree.insert(-1, 0, temp_id, true); 
00430                         new_tree.left();
00431                         taxa_ = temp_tree.back(); 
00432                         temp_tree.pop_back(); 
00433                         new_tree.insert(taxa_, 1, temp_id, false); 
00434                         internal--; 
00435                 }
00436                 taxa_ = temp_tree.back();
00437                 temp_tree.pop_back();
00438                 new_tree.insert(taxa_, 0, temp_id, false); 
00439                 new_tree.left();
00440                 //cout << endl;
00441                 new_tree.reset();
00442                 all_trees.push_back(new_tree); 
00443                 new_tree.clear(); 
00444         }
00445         return;
00446 }
00447 
00449 void display_hash( map<vector<short>, list<SuperTree> > merge_hash ) {
00450         map<vector<short>, list<SuperTree> >::iterator show = merge_hash.begin();
00451                 while ( show != merge_hash.end() ) {
00452                         vector<short> temp_id = show->first;
00453                         for ( int i = 0; i < temp_id.size(); ++i) {
00454                                 cout << temp_id[i] << "|";
00455                         }
00456                         cout << " :||: ";
00457                         list<SuperTree> temp = show->second;
00458                         list<SuperTree>::iterator stuff = temp.begin();
00459                         while ( stuff != temp.end() ) {
00460                                 (*stuff).DisplayInorder( (*stuff).root() );
00461                                 ++stuff;
00462                                 cout << "**********************************************" << endl;
00463                         }
00464                         cout << endl;
00465 
00466                         ++show;
00467         }
00468 }
00469 
00471 void merge_supertrees(map<vector<short>, list<SuperTree> > hash, int longest ) {
00473 
00475         while ( longest > 1 && hash.size() != 1 ) {
00476                 map<vector<short>, list<SuperTree> >::iterator merger = hash.begin();
00477                 map<vector<short>, list<SuperTree> >::iterator finder;
00478                 while ( merger != hash.end() ) {
00479                         if ( merger->second.size() > 1 ) { 
00480                                 list<SuperTree> temp = merger->second;
00481                                 merge_trees(temp); 
00482                                 merger->second.clear();
00483                                 merger->second = temp;
00484                                 temp.clear();
00485                         }
00486                         ++merger;
00487                 }
00488 
00489                 if ( DISPLAY ) 
00490                         display_hash(hash);
00491 
00493                 merger = hash.begin();
00494                 if ( DISPLAY )
00495                         cout << "STAGE TWO OF MERGE!!!!" << endl;
00496                 while ( merger != hash.end() ) {
00497                         if ( merger->first.size() == longest) { 
00498                                 vector<short> temp_id = merger->first;
00499                                 list<SuperTree> temp_list = merger->second;
00500                                 map<vector<short>, list<SuperTree> >::iterator temp_itr = merger;
00501                                 ++merger;
00502                                 hash.erase(temp_itr);
00504                                 temp_id.pop_back(); 
00505                                 temp_list.front().root()->id.pop_back(); 
00506                                 finder = hash.find(temp_id); 
00507                                 if ( finder != hash.end() ) {
00508                                         hash[temp_id].push_back(temp_list.front()); 
00509                                 }
00510                                 else {
00511                                         hash.insert(make_pair(temp_id, temp_list)); 
00512                                 }
00513                         }
00514                         else
00515                                 ++merger; 
00516                 }
00517                 --longest; 
00518 
00519                 if ( DISPLAY ) {
00520                         display_hash(hash);
00521                         cout << "longest is: " << longest << endl;
00522                         cout << "hash table's size is: " << hash.size() << endl;
00523                 }
00524         }
00525 
00526     map<vector<short>, list<SuperTree> >::iterator merger = hash.begin();
00527     while ( merger != hash.end() ) { 
00528                 if ( merger->second.size() > 1 ) { 
00529                         list<SuperTree> temp = merger->second;
00530                         merge_trees(temp); 
00531                         merger->second.clear();
00532                         merger->second = temp;
00533                         temp.clear();
00534                 }
00535                 ++merger; 
00536         }
00537 
00538         merger = hash.begin();
00539         SuperTree checker = merger->second.front();
00540         list<string> print;
00541         checker.Newick(print, checker.root()); 
00542         print.push_back(";");
00543         cout << "The outputted tree is:" << endl;
00544         ofstream tout;
00545         tout.open("newick.txt");
00546         int count_taxa = 0;
00547         list<string>::iterator show = print.begin();
00548         while (show != print.end() ) {
00549                 cout << (*show) << " ";
00550                 tout << (*show); 
00551                 if ( ((*show)!=")") && ((*show)!="(") && ((*show)!=",") ) {
00552                         ++count_taxa;
00553                 }
00554                 ++show;
00555         }
00556         cout << endl;
00557         tout.close();
00558         cout << "we have successfully created a tree with " << num_taxa << " taxa." << endl;
00559 
00560 }
00561 
00562 void create_supertree(list<Node *> & my_leaves ) { 
00563 
00564         list<SuperTree> subtrees;
00565 
00567         map<int, list<int> > hash_map;
00568         hash_map = create_hash(my_leaves); 
00569 
00570         //debugging code: print out hash_map:
00571         //cout << "Printing out hash_map!" << endl;
00572         if  ( DISPLAY ) {
00573                 map<int, list<int> >::iterator show = hash_map.begin();
00574                 while ( show != hash_map.end() ) {
00575                         cout << show->first << " | ";
00576                         list<int> temp = show->second;
00577                         list<int>::iterator stuff = temp.begin();
00578                         while ( stuff != temp.end() ) {
00579                                 cout << (*stuff) << " ";
00580                                 ++stuff;
00581                         }
00582                         cout << endl;
00583                         ++show;
00584                 }
00585         }
00586 
00587         list<Node *>::iterator rake = my_leaves.begin();
00588         vector< helper > temp_clusters; 
00589         while ( rake != my_leaves.end() ) { 
00590                 list<element> group = (*rake)->data; 
00591                 vector<short> id = (*rake)->id;
00592                 list<element>::iterator sorter = group.begin();
00593                 while ( sorter != group.end() ) {
00594                         string taxa = (*sorter).get_taxa(); 
00595                         vector<int> taxon = separate(taxa); 
00596                         vector<int> merged;
00597                         if (!hash_map[taxon[0]].empty()) {
00598                                 merge(taxon[0], hash_map, merged); 
00599                                 if ( DISPLAY) {
00600                                         cout << "merged set: " << endl;
00601                                         for ( unsigned int i = 0; i < merged.size(); ++i) {
00602                                                 cout << merged[i] << ".";
00603                                         }
00604                                         cout << endl;
00605                                 }
00606                                 helper tree_guide(id, merged); 
00607                                 temp_clusters.push_back(tree_guide);
00608                         }
00609                         ++sorter; 
00610                 }
00611                 ++rake; 
00612         }
00613 
00614     if ( DISPLAY ) { 
00615                 for ( unsigned int i = 0; i < temp_clusters.size(); ++i ) {
00616                         temp_clusters[i].display();
00617                 }
00618         }
00619 
00620         create_subtrees(temp_clusters, subtrees); 
00621     cout << "sub-tree creation complete." << endl;
00622 
00624         map<vector<short>, list<SuperTree> > merge_hash;
00625         int longest = 0;
00626         merge_hash = create_superhash(subtrees, longest); 
00627 
00628         if ( DISPLAY ) {
00629                 cout << "longest id is: " << longest << endl;
00630                 cout << "Printing out merge_hash!" << endl;
00631                 display_hash(merge_hash);
00632         }
00633         merge_supertrees(merge_hash, longest); 
00634 
00635     return;
00636 }
00637 
00638 #endif

Generated on Wed Jul 26 22:18:14 2006 for RaqApproach by  doxygen 1.4.7