#!/usr/bin/perl -w package Vector; require Exporter; use strict; our (@ISA, @EXPORT); @ISA = qw(Exporter); @EXPORT = qw(min max sum sum2 sumProduct mean var sd corr within readDelim readDelimList readXLSAttrs readXLSList uniq); { sub min(@) { die if scalar @_ == 0; my $bestIndex = 0; foreach my $index (1..$#_) { $bestIndex = $index if $_[$index] < $_[$bestIndex]; } return $_[$bestIndex]; } sub max(@) { die if scalar @_ == 0; my $bestIndex = 0; foreach my $index (1..$#_) { $bestIndex = $index if $_[$index] > $_[$bestIndex]; } return $_[$bestIndex]; } sub sum( @ ) { my @x = @_; my $sum = 0; map { $sum += $_ } @x; return $sum; } sub sum2( @ ) { my @x = @_; my $sum2 = 0; map { $sum2 += $_ * $_ } @x; return $sum2; } sub mean( @ ) { my @x = @_; return 0 if scalar(@x) == 0; return sum(@x)/scalar(@x); } sub var( @ ) { my @x = @_; return 0 if scalar (@x) < 2; my $sumX = sum(@x); return( (sum2(@x)-($sumX*$sumX))/( (scalar(@x)-1)*(scalar(@x)-1)) ); } sub sd( @ ) { my @x = @_; my $varX = var(@x); return ($varX < 1e-8 ? 0 : sqrt($varX)); } sub sumProduct( $$ ) { my ($listRef1,$listRef2) = @_; die "Lists of different lengths" if scalar @{$listRef1} != scalar @{$listRef2}; my $sumXY = 0; } # accepts as input two references to lists sub corr( $$ ) { my ($listRef1,$listRef2) = @_; die "Lists of different lengths" if scalar @{$listRef1} != scalar @{$listRef2}; my $n = scalar @$listRef1; die "Lists too short" if $n < 2; my $sum1 = sum( @$listRef1 ); my $sum1sq = sum2( @$listRef1 ); my $sum2 = sum( @$listRef2 ); my $sum2sq = sum( @$listRef2 ); return ( (sumProduct($listRef1,$listRef2 ) - $sum1*$sum2) / sqrt(($sum1sq-$sum1*$sum1)*($sum2sq-$sum2*$sum2)) ); } sub within(@) { my $test = shift; foreach (@_) { return 1 if $test eq $_; } return 0; } # given a tab-delimited file with a header file, and a column name to index by, # return a hash of column name -> index -> value. sub readDelim($$) { my ($file, $indexCol) = @_; open (READDELIM, '<', $file) || die "Cannot read $file"; $_ = ; die "$file has no header line" if !($_); s/[\r\n]+$//; my @header = split /\t/, $_, -1; # keep empty fields my %names = map { $_ => {} } @header; my %index = map { $header[$_] => $_ } (0..$#header); die "Cannot find index column $indexCol in $file" if !exists $index{$indexCol}; my $iIndex = $index{$indexCol}; while(my $line = ) { $line =~ s/[\r\n]+$//; my @columns = split /\t/, $line, -1; # keep empty fields die "Wrong number of columns in $line from $file" if $#columns != $#header; die "Dup index $columns[$iIndex]" if exists $names{$columns[$iIndex]}; foreach my $iCol (0..$#columns) { $names{$header[$iCol]}{$columns[$iIndex]} = $columns[$iCol]; } } close(READDELIM) || die "Cannot close $file"; return \%names; } # given a tab-delimited file with a header file, return a list of column name -> value. # use instead of readDelim when you want to keep the order in the file sub readDelimList($) { my ($file) = @_; open (READDELIM, '<', $file) || die "Cannot read $file"; $_ = ; die "$file has no header line" if !($_); s/[\r\n]+$//; my @header = split /\t/, $_, -1; # keep empty fields my @list = (); while(my $line = ) { $line =~ s/[\r\n]+$//; my @columns = split /\t/, $line, -1; # keep empty fields die "Wrong number of columns in $line from $file" if $#columns != $#header; my %names = (); foreach my $iCol (0..$#columns) { $names{$header[$iCol]} = $columns[$iCol]; } push(@list, \%names); } close(READDELIM) || die "Cannot close $file"; return \@list; } # read just the attributes from a file with attributes of the form !Name=Value at the top sub readXLSAttrs($) { my ($file) = @_; my %attrs = (); open (READDELIM, '<', $file) || die "Cannot read $file"; while() { s/[\r\n]+$//; if (m/^!/) { if (m/^!([A-Za-z -+_.#\$ 0-9]+)=(.*)$/) { die "Duplicate attr $1" if exists $attrs{$1}; $attrs{$1} = $2; } else { die "No attr in line $_\n"; } } else { last; } } close(READDELIM) || die "Cannot close $file"; return (\%attrs); } # read a tab-delimited file with optional attributes of the form !Name=Value before the header # returns a list of columnName->value and a hash of the attributes sub readXLSList($) { my ($file) = @_; my @header = (); my %attrs = (); open (READDELIM, '<', $file) || die "Cannot read $file"; for(;;) { $_ = ; die "$file has no header line" if !($_); s/[\r\n]+$//; if (m/^!/) { if (m/^!([A-Za-z -+_.#\$ 0-9]+)=(.*)$/) { die "Duplicate attr $1" if exists $attrs{$1}; $attrs{$1} = $2; } else { die "No attr in line $_\n"; } } else { @header = split /\t/, $_, -1; # keep empty fields last; } } my @list = (); while(my $line = ) { $line =~ s/[\r\n+]$//; my @columns = split /\t/, $line, -1; # keep empty fields die "Wrong number of columns in $line from $file" if $#columns != $#header; my %names = (); foreach my $iCol (0..$#columns) { $names{$header[$iCol]} = $columns[$iCol]; } push(@list, \%names); } close(READDELIM) || die "Cannot close $file"; return (\@list, \%attrs); } sub uniq(@) { return() if scalar @_ == 0; my @out = (shift); while (scalar @_ > 0) { my $new = shift; push @out, $new if $new ne $out[scalar(@out)-1]; } return(@out); } } return 1;