matrix.h

00001 
00002 
00007 //(c) 2006 Suzanne Matthews
00008 /*
00009 This file is part of the RaqApproach software package.
00010 
00011     RaqApproach is free software; you can redistribute it and/or modify
00012     it under the terms of the GNU General Public License as published by
00013     the Free Software Foundation; either version 2 of the License, or
00014     (at your option) any later version.
00015 
00016     RaqApproach is distributed in the hope that it will be useful,
00017     but WITHOUT ANY WARRANTY; without even the implied warranty of
00018     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019     GNU General Public License for more details.
00020 
00021     You should have received a copy of the GNU General Public License
00022     along with RaqApproach; if not, write to the Free Software
00023     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00024 */
00025 #ifndef MATRIX_H_
00026 #define MATRIX_H_
00027 
00028 #include <math.h>
00029 #include "elements.h"
00030 #include <cstdlib>
00031 #include <cstdio>
00032 #include "common.h"
00033 #include <algorithm>
00034 
00035 using namespace std;
00036 
00037 int find_length(string one, string two); 
00038 double calculateDistance(string first, string last, int dist_type, const int penalty); 
00039 double uncorrected(string one, string two, const int penalty); 
00040 double jukes_cantor(string one, string two, const int penalty);
00041 double kimura(string one, string two);
00042 int calculateMax(int max_score); 
00043 double calculateScore(string first, string second); 
00044 bool is_transition(char achar, char bchar); 
00045 
00046 
00047 void init_unknowns()
00048 {
00049         unknown_chars.push_back('S');
00050         unknown_chars.push_back('W');
00051         unknown_chars.push_back('H');
00052         unknown_chars.push_back('B');
00053         unknown_chars.push_back('V');
00054         unknown_chars.push_back('D');
00055         unknown_chars.push_back('N');
00056         unknown_chars.push_back('?');
00057 
00058 }
00059 
00060 void preprocess(string & newstring ) { 
00061         for ( unsigned int i = 0; i < newstring.size(); ++i ) {
00062                 char temp = toupper(newstring[i]);
00063                 if ( (temp != 'A') &&
00064                      (temp != 'G') &&
00065                      (temp != 'C') &&
00066                      (temp != 'T') &&
00067                      (temp != 'R') &&
00068                      (temp != 'Y') &&
00069                      (temp != '-') )
00070                      newstring[i] = unknown_char; 
00071         }
00072 }
00073 multimap<double, string> create_matrix(int dist_type, const int penalty) {
00074 
00076   ifstream fin;
00077   fin.open("taxa.txt");
00078   if (!fin) { 
00079     cout << "file cannot be opened." << endl;
00080     multimap<double, string> error_matrix;
00081     error_matrix.insert(make_pair(0.0, "error"));
00082     return error_matrix;
00083   }
00084 
00086   string newstring;
00087   string numtaxa_string;
00088   multimap<double, string> matrix;
00089   list<string> taxa;
00090   cout << "preprocessing taxa file..." << endl;
00091   fin >> numtaxa_string;
00092   stringstream sst;
00093   sst << numtaxa_string;
00094   sst >> num_taxa;
00095   cout << "num_taxa is: " << num_taxa << endl;
00096   while ( !fin.eof() ){ 
00097     fin >> newstring;
00098     preprocess(newstring); 
00099     taxa.push_back(newstring); 
00100   }
00101   cout << "done." << endl;
00102 
00103   fin.close(); 
00104   list<string>::iterator istr = taxa.begin();
00105   if ( DISPLAY ) {
00106           cout << "outputting processed taxa:" << endl;
00107           for (istr; istr != taxa.end(); ++istr) { 
00108                 cout << (*istr) << endl;
00109           }
00110   }
00111 
00112   istr = taxa.begin();
00113   list<string>::iterator istr2 = taxa.begin();
00114   cout << "calculating distance matrix..." << endl;
00115   float dist = 0.0;
00116   int i,j;
00117   i = j = 1;
00118   for ( istr; istr !=  taxa.end(); ++istr) {
00119     istr2 = istr;
00120     ++istr2;
00121     for ( istr2; istr2 != taxa.end(); ++istr2) {
00122       ++j;
00123       dist = calculateDistance((*istr), (*istr2), dist_type, penalty); 
00124       if ( DISPLAY ) {
00125                   cout << "distance between taxa " << i << " and " << j;
00126                   cout << " is: " << dist << endl; 
00127           }
00128       stringstream ss, ss2;
00129       string str;
00130       string str2;
00131       ss << i;
00132       ss >> str;
00133       ss2 << j;
00134       ss2 >> str2;
00135       string tag = str + "." + str2; 
00136       matrix.insert( make_pair(dist, tag) ); 
00137     }
00138     ++i;
00139     j = i;
00140   }
00141   cout << "distance matrix creation complete." << endl;
00142   if ( DISPLAY ) {
00143           multimap<double, string>::iterator m_itr = matrix.begin();
00144           while ( m_itr != matrix.end() ) {
00145                 cout << m_itr ->first << " " << m_itr->second << endl; 
00146                 ++m_itr;
00147           }
00148   }
00149   return matrix;
00150 
00151 }
00152 
00153 int calculateMax(int max_score) { 
00154   int newmax = 0;
00155   newmax = max_score; 
00156   return newmax;
00157 }
00158 
00159 double calculateDistance(string one, string two, int dist_type, const int penalty) { 
00160 
00161   double distance = 0;
00162 
00163   switch(dist_type) {
00164   case 1: distance = uncorrected(one, two, penalty); 
00165     break;
00166   case 2: distance = jukes_cantor(one, two, penalty); 
00167     break;
00168   case 3: distance = kimura(one, two); 
00169     break;
00170   case 4: distance = calculateScore(one, two); 
00171     break;
00172   default: cout << "invalid option for distance!" << endl;
00173   }
00174 
00175   return distance;
00176 }
00177 
00178 int find_length(string one, string two) { 
00179 
00180   int length = 0;
00181   if ( one.size() < two.size() )
00182     length = one.size();
00183   else
00184     length = two.size();
00185 
00186   return length;
00187 }
00188 
00189 double uncorrected(string one,string two, const int gapp) { 
00190 
00191 
00192   double distance = 0.0;
00193   int matches = 0;
00194   int positions_scored = find_length(one, two);
00195 
00196   int gaps = 0;
00197 
00198   for ( unsigned int i = 0; i < positions_scored; ++i ) {
00199     if ( (one[i] == two[i]) && (one[i]!= unknown_char) ) {
00200       matches++;
00201     }
00202     else {
00203       if ( one[i] == gap_char || two[i] == gap_char )
00204                 gaps++;
00205     }
00206   }
00207 
00208   double temp = gaps*gapp; 
00209   temp = temp + positions_scored;
00210   double S = matches / temp; 
00211   distance = 1 - S; 
00212   return distance;
00213 }
00214 
00215 double jukes_cantor(string one, string two, const int gapp) { 
00216   double distance = 0.0;
00217   const float b = .75; 
00218 
00219   double D = uncorrected(one, two, gapp); 
00220   if ( D >= b) 
00221     return D;
00222   D = (double)D/(double)b; 
00223   D = 1 - D;
00224   distance = log(D); 
00225   distance = distance*b;
00226   distance = distance*-1; 
00227 
00228   return distance;
00229 }
00230 
00231 bool matches_unknowns(char a_char) { 
00232         vector<char>::iterator v_itr = unknown_chars.begin();
00233         v_itr = find(unknown_chars.begin(), unknown_chars.end(), a_char);
00234         if ( v_itr != unknown_chars.end() )
00235                 return true;
00236         return false;
00237 }
00238 
00239 double kimura(string one, string two) { 
00240   int positions_scored = find_length(one, two);
00241   int transitions = 0;
00242   int transversions = 0;
00243   double distance = 0.0;
00244 
00245   for ( unsigned int i = 0; i < positions_scored; ++i) {
00246     if ( matches_unknowns(one[i]) || one[i]== gap_char ) 
00247       continue;
00248     else if ( matches_unknowns(two[i]) || two[i] == gap_char ) 
00249       continue;
00250     else if ( is_transition(one[i],two[i]) ) 
00251       ++transitions; 
00252     else
00253       ++transversions; 
00254   }
00255 
00256   double P = (double)transitions / (double)positions_scored; 
00257   double Q = (double)transversions / (double)positions_scored; 
00258 
00259   double temp =  2*Q; 
00260   temp = 1 - temp; 
00261   temp = sqrt(temp); 
00262   double newtemp = 2*P; 
00263   newtemp = newtemp - Q; 
00264   newtemp = 1 - newtemp; 
00265   temp = newtemp*temp; 
00266   temp = log(temp); 
00267   distance = -.5 * temp; 
00268 
00269   return distance;
00270 }
00271 
00272 
00273 double calculateScore(string one, string two) { 
00274   double score=0.0;
00277   //**********************************
00278   const int match_score = 1;
00279   const float alpha = .5;
00280   const float beta = .25;
00281   //*********************************
00282   init_unknowns();
00283   bool penalize_end_gaps = true; 
00284 
00285   int max_score = 0;
00286 
00287   max_score = find_length(one, two);
00288   max_score = calculateMax(max_score); 
00289 
00290   int gapscore;
00291   if ( penalize_end_gaps == true ) 
00292     gapscore = -1;
00293   else
00294     gapscore = 0;
00295 
00296   unsigned int i = one.size()-1;
00297   unsigned int j = two.size()-1;
00298 
00299   for (unsigned int x = 0; x < one.size(); ++x) { 
00300     if ( (one[x] == two[x]) ) { 
00301       if ( ( one[x] != gap_char) )
00302         score++; 
00303       else
00304         score = score + 0; 
00305     }
00306     else {
00307       if (one[x] == gap_char || two[x] == gap_char) { 
00308                 score--;
00309       }
00310       else if ( (one[x] == unknown_char) || (two[x] == unknown_char) )
00311         score = score + 0;
00312       else if (is_transition(one[x], two[x]) ) { 
00313                 score = score+alpha; 
00314       }
00315       else {
00316                 score = score+beta; 
00317       }
00318     }
00319   }
00320   double distance = (max_score - score)/max_score; 
00321   return distance;
00322 }
00323 
00324 bool is_transition( char achar, char bchar ) { 
00325 
00326   /*Note: A-G and C-T are transitions. All else are transversions*/
00327 
00328   char newa = toupper(achar);
00329   char newb = toupper(bchar);
00330 
00331   switch ( newa ) {
00332 
00333   case 'A': {
00334     if ( ( newb == 'G') || (newb == 'R') )
00335       return true;
00336     else if (newb == 'A')
00337       return true;
00338     else return false;
00339   }
00340     break;
00341   case 'G': {
00342     if ( (newb == 'A') || ( newb == 'R') )
00343       return true;
00344     else if ( newb == 'G')
00345       return true;
00346     else return false;
00347   }
00348     break;
00349   case 'C': {
00350     if ( ( newb == 'T') || ( newb == 'Y') )
00351       return true;
00352     else if ( newb =='C')
00353       return true;
00354     else return false;
00355   }
00356     break;
00357   case 'T': {
00358     if ( (newb == 'C') || (newb == 'Y') )
00359       return true;
00360     else if ( newb == 'T')
00361       return true;
00362     else return false;
00363   }
00364     break;
00365   case 'R': {
00366         if ( (newb == 'A') || (newb == 'G') )
00367           return true;
00368         else if ( newb == 'R')
00369           return true;
00370         else return false;
00371   }
00372     break;
00373   case 'Y': {
00374         if ( (newb == 'C') || (newb == 'T') )
00375           return true;
00376         else if ( newb == 'Y')
00377           return true;
00378         else return false;
00379   }
00380     break;
00381   default: cout << "ERROR! Not valid DNA sequence!: " << newa << endl;
00382   }
00383 }
00384 
00385 #endif

Generated on Wed Jul 26 22:18:14 2006 for RaqApproach by  doxygen 1.4.7