00001
00002
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef MATRIX_H_
00026 #define MATRIX_H_
00027
00028 #include <math.h>
00029 #include "elements.h"
00030 #include <cstdlib>
00031 #include <cstdio>
00032 #include "common.h"
00033 #include <algorithm>
00034
00035 using namespace std;
00036
00037 int find_length(string one, string two);
00038 double calculateDistance(string first, string last, int dist_type, const int penalty);
00039 double uncorrected(string one, string two, const int penalty);
00040 double jukes_cantor(string one, string two, const int penalty);
00041 double kimura(string one, string two);
00042 int calculateMax(int max_score);
00043 double calculateScore(string first, string second);
00044 bool is_transition(char achar, char bchar);
00045
00046
00047 void init_unknowns()
00048 {
00049 unknown_chars.push_back('S');
00050 unknown_chars.push_back('W');
00051 unknown_chars.push_back('H');
00052 unknown_chars.push_back('B');
00053 unknown_chars.push_back('V');
00054 unknown_chars.push_back('D');
00055 unknown_chars.push_back('N');
00056 unknown_chars.push_back('?');
00057
00058 }
00059
00060 void preprocess(string & newstring ) {
00061 for ( unsigned int i = 0; i < newstring.size(); ++i ) {
00062 char temp = toupper(newstring[i]);
00063 if ( (temp != 'A') &&
00064 (temp != 'G') &&
00065 (temp != 'C') &&
00066 (temp != 'T') &&
00067 (temp != 'R') &&
00068 (temp != 'Y') &&
00069 (temp != '-') )
00070 newstring[i] = unknown_char;
00071 }
00072 }
00073 multimap<double, string> create_matrix(int dist_type, const int penalty) {
00074
00076 ifstream fin;
00077 fin.open("taxa.txt");
00078 if (!fin) {
00079 cout << "file cannot be opened." << endl;
00080 multimap<double, string> error_matrix;
00081 error_matrix.insert(make_pair(0.0, "error"));
00082 return error_matrix;
00083 }
00084
00086 string newstring;
00087 string numtaxa_string;
00088 multimap<double, string> matrix;
00089 list<string> taxa;
00090 cout << "preprocessing taxa file..." << endl;
00091 fin >> numtaxa_string;
00092 stringstream sst;
00093 sst << numtaxa_string;
00094 sst >> num_taxa;
00095 cout << "num_taxa is: " << num_taxa << endl;
00096 while ( !fin.eof() ){
00097 fin >> newstring;
00098 preprocess(newstring);
00099 taxa.push_back(newstring);
00100 }
00101 cout << "done." << endl;
00102
00103 fin.close();
00104 list<string>::iterator istr = taxa.begin();
00105 if ( DISPLAY ) {
00106 cout << "outputting processed taxa:" << endl;
00107 for (istr; istr != taxa.end(); ++istr) {
00108 cout << (*istr) << endl;
00109 }
00110 }
00111
00112 istr = taxa.begin();
00113 list<string>::iterator istr2 = taxa.begin();
00114 cout << "calculating distance matrix..." << endl;
00115 float dist = 0.0;
00116 int i,j;
00117 i = j = 1;
00118 for ( istr; istr != taxa.end(); ++istr) {
00119 istr2 = istr;
00120 ++istr2;
00121 for ( istr2; istr2 != taxa.end(); ++istr2) {
00122 ++j;
00123 dist = calculateDistance((*istr), (*istr2), dist_type, penalty);
00124 if ( DISPLAY ) {
00125 cout << "distance between taxa " << i << " and " << j;
00126 cout << " is: " << dist << endl;
00127 }
00128 stringstream ss, ss2;
00129 string str;
00130 string str2;
00131 ss << i;
00132 ss >> str;
00133 ss2 << j;
00134 ss2 >> str2;
00135 string tag = str + "." + str2;
00136 matrix.insert( make_pair(dist, tag) );
00137 }
00138 ++i;
00139 j = i;
00140 }
00141 cout << "distance matrix creation complete." << endl;
00142 if ( DISPLAY ) {
00143 multimap<double, string>::iterator m_itr = matrix.begin();
00144 while ( m_itr != matrix.end() ) {
00145 cout << m_itr ->first << " " << m_itr->second << endl;
00146 ++m_itr;
00147 }
00148 }
00149 return matrix;
00150
00151 }
00152
00153 int calculateMax(int max_score) {
00154 int newmax = 0;
00155 newmax = max_score;
00156 return newmax;
00157 }
00158
00159 double calculateDistance(string one, string two, int dist_type, const int penalty) {
00160
00161 double distance = 0;
00162
00163 switch(dist_type) {
00164 case 1: distance = uncorrected(one, two, penalty);
00165 break;
00166 case 2: distance = jukes_cantor(one, two, penalty);
00167 break;
00168 case 3: distance = kimura(one, two);
00169 break;
00170 case 4: distance = calculateScore(one, two);
00171 break;
00172 default: cout << "invalid option for distance!" << endl;
00173 }
00174
00175 return distance;
00176 }
00177
00178 int find_length(string one, string two) {
00179
00180 int length = 0;
00181 if ( one.size() < two.size() )
00182 length = one.size();
00183 else
00184 length = two.size();
00185
00186 return length;
00187 }
00188
00189 double uncorrected(string one,string two, const int gapp) {
00190
00191
00192 double distance = 0.0;
00193 int matches = 0;
00194 int positions_scored = find_length(one, two);
00195
00196 int gaps = 0;
00197
00198 for ( unsigned int i = 0; i < positions_scored; ++i ) {
00199 if ( (one[i] == two[i]) && (one[i]!= unknown_char) ) {
00200 matches++;
00201 }
00202 else {
00203 if ( one[i] == gap_char || two[i] == gap_char )
00204 gaps++;
00205 }
00206 }
00207
00208 double temp = gaps*gapp;
00209 temp = temp + positions_scored;
00210 double S = matches / temp;
00211 distance = 1 - S;
00212 return distance;
00213 }
00214
00215 double jukes_cantor(string one, string two, const int gapp) {
00216 double distance = 0.0;
00217 const float b = .75;
00218
00219 double D = uncorrected(one, two, gapp);
00220 if ( D >= b)
00221 return D;
00222 D = (double)D/(double)b;
00223 D = 1 - D;
00224 distance = log(D);
00225 distance = distance*b;
00226 distance = distance*-1;
00227
00228 return distance;
00229 }
00230
00231 bool matches_unknowns(char a_char) {
00232 vector<char>::iterator v_itr = unknown_chars.begin();
00233 v_itr = find(unknown_chars.begin(), unknown_chars.end(), a_char);
00234 if ( v_itr != unknown_chars.end() )
00235 return true;
00236 return false;
00237 }
00238
00239 double kimura(string one, string two) {
00240 int positions_scored = find_length(one, two);
00241 int transitions = 0;
00242 int transversions = 0;
00243 double distance = 0.0;
00244
00245 for ( unsigned int i = 0; i < positions_scored; ++i) {
00246 if ( matches_unknowns(one[i]) || one[i]== gap_char )
00247 continue;
00248 else if ( matches_unknowns(two[i]) || two[i] == gap_char )
00249 continue;
00250 else if ( is_transition(one[i],two[i]) )
00251 ++transitions;
00252 else
00253 ++transversions;
00254 }
00255
00256 double P = (double)transitions / (double)positions_scored;
00257 double Q = (double)transversions / (double)positions_scored;
00258
00259 double temp = 2*Q;
00260 temp = 1 - temp;
00261 temp = sqrt(temp);
00262 double newtemp = 2*P;
00263 newtemp = newtemp - Q;
00264 newtemp = 1 - newtemp;
00265 temp = newtemp*temp;
00266 temp = log(temp);
00267 distance = -.5 * temp;
00268
00269 return distance;
00270 }
00271
00272
00273 double calculateScore(string one, string two) {
00274 double score=0.0;
00277
00278 const int match_score = 1;
00279 const float alpha = .5;
00280 const float beta = .25;
00281
00282 init_unknowns();
00283 bool penalize_end_gaps = true;
00284
00285 int max_score = 0;
00286
00287 max_score = find_length(one, two);
00288 max_score = calculateMax(max_score);
00289
00290 int gapscore;
00291 if ( penalize_end_gaps == true )
00292 gapscore = -1;
00293 else
00294 gapscore = 0;
00295
00296 unsigned int i = one.size()-1;
00297 unsigned int j = two.size()-1;
00298
00299 for (unsigned int x = 0; x < one.size(); ++x) {
00300 if ( (one[x] == two[x]) ) {
00301 if ( ( one[x] != gap_char) )
00302 score++;
00303 else
00304 score = score + 0;
00305 }
00306 else {
00307 if (one[x] == gap_char || two[x] == gap_char) {
00308 score--;
00309 }
00310 else if ( (one[x] == unknown_char) || (two[x] == unknown_char) )
00311 score = score + 0;
00312 else if (is_transition(one[x], two[x]) ) {
00313 score = score+alpha;
00314 }
00315 else {
00316 score = score+beta;
00317 }
00318 }
00319 }
00320 double distance = (max_score - score)/max_score;
00321 return distance;
00322 }
00323
00324 bool is_transition( char achar, char bchar ) {
00325
00326
00327
00328 char newa = toupper(achar);
00329 char newb = toupper(bchar);
00330
00331 switch ( newa ) {
00332
00333 case 'A': {
00334 if ( ( newb == 'G') || (newb == 'R') )
00335 return true;
00336 else if (newb == 'A')
00337 return true;
00338 else return false;
00339 }
00340 break;
00341 case 'G': {
00342 if ( (newb == 'A') || ( newb == 'R') )
00343 return true;
00344 else if ( newb == 'G')
00345 return true;
00346 else return false;
00347 }
00348 break;
00349 case 'C': {
00350 if ( ( newb == 'T') || ( newb == 'Y') )
00351 return true;
00352 else if ( newb =='C')
00353 return true;
00354 else return false;
00355 }
00356 break;
00357 case 'T': {
00358 if ( (newb == 'C') || (newb == 'Y') )
00359 return true;
00360 else if ( newb == 'T')
00361 return true;
00362 else return false;
00363 }
00364 break;
00365 case 'R': {
00366 if ( (newb == 'A') || (newb == 'G') )
00367 return true;
00368 else if ( newb == 'R')
00369 return true;
00370 else return false;
00371 }
00372 break;
00373 case 'Y': {
00374 if ( (newb == 'C') || (newb == 'T') )
00375 return true;
00376 else if ( newb == 'Y')
00377 return true;
00378 else return false;
00379 }
00380 break;
00381 default: cout << "ERROR! Not valid DNA sequence!: " << newa << endl;
00382 }
00383 }
00384
00385 #endif