show form and die if (!$_POST){print_form(); die();} // something has been posted -> go ahead set_time_limit (10000); error_reporting(0); // to show computing period, $timestart=date("U"); // GET SEQUENCES // get data from form $allsequences=$_POST["seq"]; // set a limit for maximum length of data posted if (strlen($allsequences)>2000000){ die("Error: this service does not handle input requests longer than 2,000,000 bp."); } //remove a couple of things from sequence $allsequences=substr($allsequences,strpos($allsequences,">")); // whatever is before ">", which is the start of the first sequence $allsequences=preg_replace("/\r/","",$allsequences); // remove carriage returns ("\r"), but do not remove line feeds ("\n") // split the individual sequences into $seqs array $seqs=preg_split("/>/",$allsequences,-1,PREG_SPLIT_NO_EMPTY); // get the name of each sequence (save names to array $seq_name) foreach ($seqs as $key => $val){ $seq_name[$key]=substr($val,0,strpos($val,"\n")); $temp_val=substr($val,strpos($val,"\n")); $temp_val=preg_replace("/\W|\d/","",$temp_val); $seqs[$key]=strtoupper($temp_val); } // at this moment two arrays are available: $seqs (with sequences) and $seq_names (with name of sequences) // COMPUTE // GET OLIGONUCLEOTIDE FREQUENCIES foreach($seqs as $key => $val){ // to compute oligonucleotide frequencies, both strands are used $seq_and_revseq=$val." ".RevComp($val); $oligo_array[$key]= oligo_frequencies($seq_and_revseq,$_POST["len"]); } // COMPUTE DISTANCES if ($_POST["method"]=="pearson"){ // COMPUTE PEARSON DISTANCES AMONG SEQUENCES foreach($seqs as $key => $val){ foreach($seqs as $key2 => $val2){ if ($key>=$key2){continue;} $data[$key][$key2]= Pearson_distance($oligo_array[$key],$oligo_array[$key2]); } } }else{ // COMPUTE GENOMIC SIGNATURE DISTANCES AMONG SEQUENCES foreach($seqs as $key => $val){ foreach($seqs as $key2 => $val2){ if ($key>=$key2){continue;} $data[$key][$key2]= GS_distance($oligo_array[$key],$oligo_array[$key2],$_POST["len"]); } } } // DISPLAY TABLE WITH DISTANCES print "\n"; for($i=1;$i$i"; } print "\n"; foreach($data as $key => $val){ print ""; for($i=1;$i".$data[$key][$i].""; }else{ print ""; } } print "\n"; } print "
Distances
$key 
\n"; // DISPLAY NAME OF SEQUENCES print "
\n

Name of sequences\n

\n";
        foreach($seq_name as $n => $name){
                print "  $n\t$name\n";
        }
        print "
\n"; // DISPLAY SELECTED OLIGONUCLEOTIDE LENGTH print "Length of oligoncleotides for comparison: ".$_POST["len"]; // NEXT LINES WILL PERFORM UPGMA CLUSTERING // in each loop, array $data is reduced (one case per loop) while (sizeof($data)>1){ $min=min_array($data); // global variables are created: $x, $y, $min, $cases $comp[$x][$y]=$min; $data=new_array($data,$cases,$x,$y); } $min=min_array($data); $comp[$x][$y]=$min; // end of clustering // array $comp stores the important data // $textcluster is the results of the cluster as text. // p.e.: ((3,4),7),(((5,6),1),2) $textcluster="$x,$y"; print "

Clustering method: UPGMA
$textcluster

"; // $max is the distance in the last clustering step (the root of the dendrogram) $max=$a[$x][$y]; // CREATE THE IMAGE WITH THE DENDROGRAM create_dendrogram ($textcluster,$comp,$max,$_POST["method"],$_POST["len"]); // SHOW DENDROGRAM print ""; // SHOW TIME REQUIRED FOR COMPUTING $timetotal=date("U")-$timestart; print "

Computed in $timetotal seconds
"; // ####################################################################################### // ############################## FUNCTIONS #################################### // ####################################################################################### function get_cases($a){ $done=""; foreach($a as $key => $val){ $done.="#$key"; foreach($a[$key] as $key2 =>$val2){ $done.="#$key2"; } } $cases=preg_split("/#/",$done,-1,PREG_SPLIT_NO_EMPTY); $cases=array_unique($cases); sort($cases); return $cases; } //####################################################################################### function new_array($a,$cases,$x,$y){ $cases=get_cases($a); for($j=0; $j$val){ $str_cases.="#$key"; foreach($a[$key] as $key2 =>$val2){ if ($val==""){continue;} $str_cases.="#$key2"; if ($val2<$min){ $min=$val2; $x=$key; $y=$key2; } } } $cases=preg_split("/#/",$done,-1,PREG_SPLIT_NO_EMPTY); $cases=array_unique($cases); sort($cases); $min2=$min/2; return $min; } //####################################################################################### function create_dendrogram($str,$comp,$max,$method,$len){ $w=20; //height for each line (case) $str=preg_replace("/\(|\)/","",$str).","; $a=preg_split("/,/",$str,-1,PREG_SPLIT_NO_EMPTY); $rows=sizeof($a); $width=600; // width of scale from 0 to 2 $im = @imagecreatetruecolor($width*1.2, $rows*$w+40) or die("Unable to start image. It is GD available?"); $white =imagecolorallocate($im, 255, 255, 255); $black =imagecolorallocate($im, 0, 0, 0); $red =imagecolorallocate($im, 255, 0, 0); imagefilledrectangle($im,0,0,$width*1.2, $rows*$w+40,$white); $y=$rows*$w; // vertical location $f=$width; // multiplication factor // lines for scale $j=0.1; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black); $j=0.2; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black); $j=0.3; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black); $j=0.5; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black); $j=1.0; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, "1.0", $black); $j=1.5; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, $j, $black); $j=2.0; imageline ($im, log($j+1)*$f+20, $y, log($j+1)*$f+20, $y+10, $black); imagestring($im, 1, log($j+1)*$f-8+20, $y+12, "2.0", $black); // write into the image the numbers corresponding to cases foreach($a as $n => $val){ if (strlen($val)==1){$val=" $val";} imagestring($im,3, 5, $n*$w+5, $val, $black); } // WRITE LINES foreach ($comp as $key => $val){ $pos1=$pos2=0; foreach ($comp[$key] as $key2 => $val2){ // get position of case in the list $keya=preg_replace("/\(|\)/","",$key); $pos1=substr_count (" ,".substr ($str, 0,strpos(" ,".$str,",$keya,")),",")-0.4; $keyb=preg_replace("/\(|\)/","",$key2); $pos2=substr_count (" ,".substr ($str, 0,strpos(" ,".$str,",$keyb,")),",")-0.4; if (substr_count($keya,",")>0){$pos1b=$pos1+substr_count($keya,",")/2;}else{$pos1b=$pos1;} if (substr_count($keyb,",")>0){$pos2b=$pos2+substr_count($keyb,",")/2;}else{$pos2b=$pos2;} // Position related data $xkey1=$wherex[$key]; if ($xkey1==""){$xkey1=0;} $xkey2=$wherex[$key2]; if ($xkey2==""){$xkey2=0;} $max=max($xkey1,$xkey2); $min=min($xkey1,$xkey2); $xmax=$max+(($val2-($max))/2); $val4=log($xmax+1)*$f; $val4max=log($max+1)*$f; $val4min=log($min+1)*$f; // write lines if ($wherex[$key]==$max){ imageline ($im, $val4max+20, $pos1b*$w, $val4+20, $pos1b*$w, $black); // --- imageline ($im, $val4+20, $pos1b*$w, $val4+20, $pos2b*$w, $black); // | imageline ($im, $val4min+20, $pos2b*$w, $val4+20, $pos2b*$w, $black); // --- }else{ imageline ($im, $val4min+20, $pos1b*$w, $val4+20, $pos1b*$w, $black); // --- imageline ($im, $val4+20, $pos1b*$w, $val4+20, $pos2b*$w, $black); // | imageline ($im, $val4max+20, $pos2b*$w, $val4+20, $pos2b*$w, $black); // --- } $wherex["(".$key.",".$key2.")"]=$xmax; } } imageline ($im, $val4+20, ($pos1b+$pos2b)*$w/2, $val4+40, ($pos1b+$pos2b)*$w/2, $black); imageline ($im, 20, $y, $width*1.2, $y, $black); if ($method=="pearson"){ imagestring($im, 2, 5, $rows*$w+25, "Pearson distance for $len bases long oligonucleotides.", $red); }else{ imagestring($im, 2, 5, $rows*$w+25, "Genomic Signature distance for $len bases long oligonucleotides.", $red); } imagestring($im, 2, $width*1, $rows*$w+25, "by gscompare.ehu.eus", black); imagepng($im,"image.png"); imagedestroy($im); } //####################################################################################### function RevComp($code){ $code=strrev($code); $code=str_replace("A", "t", $code); $code=str_replace("T", "a", $code); $code=str_replace("G", "c", $code); $code=str_replace("C", "g", $code); $code = strtoupper ($code); return $code; } //####################################################################################### function Pearson_distance($vals_x,$vals_y){ // $a and $b are arrays if (sizeof($vals_x)!= sizeof($vals_y)){return;} $sum_x=array_sum($vals_x); $sum_y=array_sum($vals_y); $sum_x2=0; $sum_y2=0; $sum_xy=0; $n=sizeof($vals_x); foreach($vals_x as $key => $val){ $val_x=$val; $val_y=$vals_y[$key]; $sum_x2+=$val_x*$val_x; $sum_y2+=$val_y*$val_y; $sum_xy+=$val_x*$val_y; } // calculate regression $regresion=($sum_xy-(1/$n)*$sum_x*$sum_y)/((sqrt($sum_x2-(1/$n)*$sum_x*$sum_x)*(sqrt($sum_y2-(1/$n)*$sum_y*$sum_y)))); if ($regresion>0.999999999){$regresion=1;} // round data return (1-$regresion); } function GS_distance($a,$b,$k){ // $a and $b are arrays and $k is the length of oligonucleotides searches // Wang et al, Gene 2005; 346:173-185 // values within arrays are occurences $sumA=array_sum($a); // contant $sumB=array_sum($b); // contant $sum=0; foreach($a as $key => $v){ $sum+= abs(($v/$sumA)-($b[$key]/$sumB)); } return $sum; } //####################################################################################### function oligo_frequencies($cadena,$len_oligos){ $i=0; $len=strlen($cadena)-$len_oligos+1; while ($i<$len){ $seq=substr($cadena,$i,$len_oligos); $oligos_internos[$seq]++; $i++; } $base_a=array("A","C","G","T"); $base_b=array("A","C","G","T"); $base_c=array("A","C","G","T"); $base_d=array("A","C","G","T"); $base_e=array("A","C","G","T"); $base_f=array("A","C","G","T"); //para oligos de 2 if ($len_oligos==2){ foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ if ($oligos_internos[$val_a.$val_b]){ $oligos[$val_a.$val_b] = $oligos_internos[$val_a.$val_b]; }else{ $oligos[$val_a.$val_b] = 0; } }} } //para oligos de 3 if ($len_oligos==3){ foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ if ($oligos_internos[$val_a.$val_b.$val_c]){ $oligos[$val_a.$val_b.$val_c] = $oligos_internos[$val_a.$val_b.$val_c]; }else{ $oligos[$val_a.$val_b.$val_c] = 0; } }}} } //para oligos de 4 if ($len_oligos==4){ foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ foreach($base_d as $key_d => $val_d){ if ($oligos_internos[$val_a.$val_b.$val_c.$val_d]){ $oligos[$val_a.$val_b.$val_c.$val_d] = $oligos_internos[$val_a.$val_b.$val_c.$val_d]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d] = 0; } }}}} } //para oligos de 5 if ($len_oligos==5){ foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ foreach($base_d as $key_d => $val_d){ foreach($base_e as $key_e => $val_e){ if ($oligos_internos[$val_a.$val_b.$val_c.$val_d.$val_e]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e] = $oligos_internos[$val_a.$val_b.$val_c.$val_d.$val_e]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e] = 0; } }}}}} } //para oligos de 6 if ($len_oligos==6){ foreach($base_a as $key_a => $val_a){ foreach($base_b as $key_b => $val_b){ foreach($base_c as $key_c => $val_c){ foreach($base_d as $key_d => $val_d){ foreach($base_e as $key_e => $val_e){ foreach($base_f as $key_f => $val_f){ if ($oligos_internos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f]){ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f] = $oligos_internos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f]; }else{ $oligos[$val_a.$val_b.$val_c.$val_d.$val_e.$val_f] = 0; } }}}}}} } return $oligos; } // ####################################################################################### function print_form(){ ?>

Distance between sequences:
comparison of oligonucleotide composition

  • This tool will compute distance between input sequences and UPGMA clustering will be applied.
  • Distances are shown in a table, and a dendrogram is displaied.
  • Check carefully the results. Theirvalue for phylogenetics is limited.
"> Copy your sequences bellow as Fasta (Maximum length: 2 MB)

Choose method to compute distances (info)
for

This is a simplified PHP script extracted from the online tool at gscompare.ehu.eus/.

Source code is available at here