#!/usr/local/bin/perl -w # express.pl # analyze gene expression data # copyright (c) 1999 joel s. bader use strict; use lib "/home/jsbader/perl/lib/site_perl"; use PDL; print STDERR "$0 begins\n"; # check whether we're supposed to take the transpose # and whether we're supposed to compute the principal factors my $errcnt = 0; my $transpose = 0; my $pca = 0; if ($#ARGV > 0) { $errcnt++; } if ($#ARGV == 0) { $transpose = ($ARGV[0] =~ /t/); $pca = ($ARGV[0] =~ /p/); $errcnt++ if ( $ARGV[0] =~ /[^tp]/ ); } if ( $errcnt ) { print "usage: $0 [tp]\n" . " t use transpose (columns)\n" . " p get principal components\n"; die; } print STDERR "reading data...\n"; # read a matrix from stdin my @matrixIn = ; my $headLine = shift(@matrixIn); # gets the column names my @colName = split(/\s+/, $headLine); # split on whitespace shift(@colName); # first column is placeholder my $ncol = scalar(@colName); my $nrow = scalar(@matrixIn); # create a pdl my $data = PDL->zeroes($nrow,$ncol); # grab the rows my @rowName = (); for (my $i = 0; $i < $nrow; $i++) { my @matrixRow = split(/\s+/, $matrixIn[$i]); $rowName[$i] = shift(@matrixRow); for (my $j = 0; $j < $ncol; $j++) { set($data, $i, $j, $matrixRow[$j]); } } if ($transpose) { print STDERR "getting transpose ...\n"; $data = transpose($data); ($nrow, $ncol) = ($ncol, $nrow); # ultra-cool swap my @tmp = @rowName; @rowName = @colName; @colName = @tmp; } print STDERR "getting correlation matrix ...\n"; # make a correlation matrix my $corln = PDL->zeroes($nrow,$nrow); for (my $i = 0; $i < $nrow; $i++) { for (my $j = 0; $j <= $i; $j++) { my $x = $data->slice("($i),:"); # row i my $y = $data->slice("($j),:"); # row j my $sumx = sum($x); my $sumy = sum($y); my $sumxx = inner($x,$x); my $sumxy = inner($x,$y); my $sumyy = inner($y,$y); my $value = ($sumxy - $sumx * $sumy / $ncol) / sqrt ( abs ( ($sumxx - $sumx * $sumx / $ncol) *($sumyy - $sumy * $sumy / $ncol) ) ); set($corln, $i, $j, $value); set($corln, $j, $i, $value); } } # convert the correlation to a distance print STDERR "getting distance ...\n"; my $distance = sqrt(2. - 2. * $corln); open(DISTANCE,">infile") or die("Could not open infile\n"); print DISTANCE "$nrow\n"; for (my $i = 0; $i < $nrow; $i++) { print DISTANCE "$rowName[$i]\t"; for (my $j = 0; $j < $nrow; $j++) { print DISTANCE at($distance,$i,$j) . "\t"; } print DISTANCE "\n"; } close DISTANCE; if ($pca) { print STDERR "centering correlation matrix ...\n"; my $c1 = average($corln); $corln -= $c1->dummy(0,$nrow); $c1 = average($corln->xchg(0,1)); $corln -= $c1->dummy(1,$nrow); my ($e, $lambda) = PDL::Math::eigens($corln); my $ix = qsorti($lambda); # get sorted index open(PCA,">pca.txt") or die("Could not open pca.txt\n"); print PCA "component:\t"; for (my $i = $nrow-1; $i >= 0; $i--) { print PCA at($lambda,at($ix,$i)) . "\t"; } print PCA "\n"; for (my $i = 0; $i < $nrow; $i++) { print PCA $rowName[$i] . "\t"; for (my $j = $nrow-1; $j >= 0; $j--) { print PCA at($e,$i,at($ix,$j)) . "\t"; } print PCA "\n"; } close(PCA); } print STDERR "normal exit\n"; exit(0);