diff --git a/Bed.pm b/Bed.pm index 7a6cc46..715deed 100644 --- a/Bed.pm +++ b/Bed.pm @@ -113,7 +113,7 @@ sub readBedFile my $i = 0; while (my $bedLine = readNextBedLine ($fin)) { - print "$i ...\n" if $verbose && $i % 100000 == 0; + print STDERR "$i ...\n" if $verbose && $i % 100000 == 0; $i++; push @ret, $bedLine; } @@ -489,7 +489,7 @@ sub splitBedFileByChrom $i++; - print "$i ...\n" if $i % 100000 == 0 && $verbose; + print STDERR "$i ...\n" if $i % 100000 == 0 && $verbose; $line =~/(\S+)\s/; @@ -528,7 +528,7 @@ sub splitBedFileByChrom if ($sort) { - print "sorting $chrom ...\n" if $verbose; + print STDERR "sorting $chrom ...\n" if $verbose; my $f = $tagCount{$chrom}->{'f'}; my $f2 = "$f.sort"; diff --git a/Common.pm b/Common.pm index 0187eab..489f3fc 100644 --- a/Common.pm +++ b/Common.pm @@ -35,6 +35,8 @@ our $VERSION = 1.01; chop_carriage clean_rep cnk + cnkln + count2prob dbinom diffArray entropy @@ -42,6 +44,8 @@ our $VERSION = 1.01; getFullPath getTaxId gscore + hypergeoTest + hypergeo intersectArray list_to_rep locateArrayElem @@ -527,7 +531,7 @@ sub stdev my $m = mean ($array); - map {$sum+= ($_-$m) ^2} @$array; + map {$sum+= ($_-$m) **2} @$array; my $n = @$array; return sqrt ($sum/($n-1)); @@ -552,7 +556,7 @@ sub norm my $array = $_[0]; my $sum = 0; - map {$sum+= $_ ^2} @$array; + map {$sum+= $_ **2} @$array; #my $elem; #foreach $elem (@$array) @@ -563,6 +567,20 @@ sub norm return $sum; } +sub count2prob +{ + die "count2prob: incorrect number of parameters.\n" if @_ != 1 && @_!= 2; + my ($array, $pseudoCount) = @_; + $pseudoCount = 0 unless $pseudoCount; + + my $s = sum ($array); + my $n = @$array; + + my @ret = map{($_+$pseudoCount)/($s+$pseudoCount * $n)} @$array; + return \@ret; +} + + sub max { @@ -905,6 +923,53 @@ sub dbinom } +#hypergeometric probability + +sub hypergeoTest +{ + my ($m, $k, $N, $n) = @_; + + my $pval = 0; + for (my $i = $k; $i <= Common::min($n, $m); $i++) + { + my $p = hypergeo ($m, $i, $N, $n); + #print "hypergeo ($m, $i, $N, $n) = $p ...\n"; + $pval += $p; + } + return $pval; +} + +sub hypergeo +{ + my ($m, $k, $N, $n) = @_; + my $logP = cnkln($m, $k) + cnkln ($N-$m, $n-$k) - cnkln ($N, $n); + return exp($logP); +} + +{ + my @cache; + $cache[0] = 0; + $cache[1] = 0; + + sub factorln + { + my $n = shift; + return $cache[$n] if defined $cache[$n]; + return undef if $n < 0; + for (my $i = scalar(@cache); $i <= $n; $i++) + { + $cache[$i] = $cache[$i-1] + log($i); + } + return $cache[$n]; + } +} + +sub cnkln +{ + my ($n, $k) = @_; + return factorln($n) - factorln($k) - factorln($n-$k); +} + #/////////////////////////Clustering//////////////////////////////// sub matrix2clusters diff --git a/Motif.pm b/Motif.pm index 8a84f74..0e8fcaf 100644 --- a/Motif.pm +++ b/Motif.pm @@ -34,6 +34,7 @@ our $VERSION = 1.01; getMatrixScore getMaxMatrixScore getMinMatrixScore + maskWord printMotif readMotifFile readMatchFile @@ -743,7 +744,7 @@ sub maskWord sub countMismatch { - my ($w1, $w2, $ignoreCase) = @_; + my ($w1, $w2, $ignoreCase, $maxDiff) = @_; $w1 =~tr/a-z/A-Z/ if $ignoreCase; $w2 =~tr/a-z/A-Z/ if $ignoreCase; @@ -753,16 +754,25 @@ sub countMismatch my @w1 = split (//, $w1); my @w2 = split (//, $w2); - return countMismatchBase (\@w1, \@w2); + return countMismatchBase (\@w1, \@w2, $maxDiff); } #we assume $w1 and $w2 has the same length +# +# my $n = countMismatchBase (\@w1, \@w2, $maxDiff); + sub countMismatchBase { - my ($w1, $w2) = @_; + my ($w1, $w2, $maxDiff) = @_; my $n = 0; my $l = @$w1; - map {$n++ if $w1->[$_] ne $w2->[$_]} (0 .. ($l-1)); + $maxDiff = $l unless $maxDiff; + + for (my $i = 0; $i < $l; $i++) + { + $n++ if $w1->[$i] ne $w2->[$i]; + last if $n >= $maxDiff; + } return $n; } diff --git a/MyConfig.pm b/MyConfig.pm index 22b22e7..92849fa 100644 --- a/MyConfig.pm +++ b/MyConfig.pm @@ -80,6 +80,10 @@ sub getGenomeDir return "$genomeDir/hg19" if ($species eq 'hg19'); return "$genomeDir/panTro3" if ($species eq 'panTro3'); return "$genomeDir/panTro4" if ($species eq 'panTro4'); + return "$genomeDir/papAnu2" if ($species eq 'papAnu2'); + return "$genomeDir/calJac3" if ($species eq 'calJac3'); + return "$genomeDir/saiBol1" if ($species eq 'saiBol1'); + return "$genomeDir/micMur1" if ($species eq 'micMur1'); return "$genomeDir/ponAbe2" if ($species eq 'ponAbe2'); return "$genomeDir/gorGor3" if ($species eq 'gorGor3'); return "$genomeDir/rheMac2" if ($species eq 'rheMac2'); @@ -95,6 +99,7 @@ sub getGenomeDir return "$genomeDir/dm2" if ($species eq 'dm2'); return "$genomeDir/ce2" if ($species eq 'ce2'); return "$genomeDir/sacCer1" if ($species eq 'sacCer1'); + return "$genomeDir/danRer10" if ($species eq 'danRer10'); Carp::croak "can not find the directory for $species\n"; } diff --git a/Quantas.pm b/Quantas.pm index b164f76..e5dca76 100644 --- a/Quantas.pm +++ b/Quantas.pm @@ -24,6 +24,8 @@ our $VERSION = 1.01; @EXPORT = qw ( readConfigFile + readASConfigFile + readExprConfigFile readExprDataFile readASDataFile readBed6DataFile @@ -48,6 +50,11 @@ use Carp; use Common; +=head2 +readConfigFile - obsolete now + +=cut + sub readConfigFile { my ($configFile, $base, $type) = @_; @@ -78,6 +85,63 @@ sub readConfigFile return \%groups; } +sub readASConfigFile +{ + my ($configFile, $base, $type) = @_; + my $fin; + open ($fin, "<$configFile") || Carp::croak "cannot open file $configFile to read\n"; + my $i = 0; + my %groups; + + while (my $line = <$fin>) + { + chomp $line; + next if $line=~/^\s*$/; + next if $line=~/^\#/; + my ($sampleName, $groupName) = split (/\t/, $line); + $groups{$groupName}->{"id"} = $i++ unless exists $groups{$groupName}; + push @{$groups{$groupName}->{"samples"}}, $sampleName; + + #check whether input file exists + my $inputFile = $base ne '' ? "$base/$sampleName" : $sampleName; + if (-d $inputFile) + { + Carp::croak "undefined type?\n" unless $type; + $inputFile = "$inputFile/$type.count.txt"; + } + Carp::croak "Input file $inputFile does not exist\n" unless -f $inputFile; + } + close ($fin); + return \%groups; +} + +sub readExprConfigFile +{ + my ($configFile, $base, $suffix) = @_; + my $fin; + open ($fin, "<$configFile") || Carp::croak "cannot open file $configFile to read\n"; + my $i = 0; + my %groups; + + while (my $line = <$fin>) + { + chomp $line; + next if $line=~/^\s*$/; + next if $line=~/^\#/; + my ($sampleName, $groupName) = split (/\t/, $line); + + $sampleName .= $suffix if $suffix; + + $groups{$groupName}->{"id"} = $i++ unless exists $groups{$groupName}; + push @{$groups{$groupName}->{"samples"}}, $sampleName; + + #check whether input file exists + my $inputFile = $base ne '' ? "$base/$sampleName" : $sampleName; + Carp::croak "Input file $inputFile does not exist\n" unless -f $inputFile; + } + close ($fin); + return \%groups; +} =head2 readExprDataFile diff --git a/Sam.pm b/Sam.pm index c343361..4b29b72 100644 --- a/Sam.pm +++ b/Sam.pm @@ -24,6 +24,7 @@ our $VERSION = 1.01; @EXPORT = qw ( lineToSam + decodeSamFlag readSamFile samToLine samToBed diff --git a/Sequence.pm b/Sequence.pm index bb0a7bd..a87074f 100644 --- a/Sequence.pm +++ b/Sequence.pm @@ -16,13 +16,15 @@ package Sequence; +use Bio::SeqIO; + use Common; use Bed; require Exporter; -our $VERSION = 1.01; +our $VERSION = 1.02; @ISA = qw (Exporter); @@ -222,25 +224,46 @@ sub writeFastaSeq =head2 wordcount -count the number of all n-mer in a DNA/RNA sequence (case insensitive, i.e. sequeuence will be converted into upper case) +count the number of all n-mer in one or more DNA/RNA sequences (case insensitive, i.e. sequeuence will be converted into upper case) my $wordHash = wordcount ($seqStr, $wordSize) +my $wordHash = wordcount ($seqStr, $wordSize, {wordHash=>$oldWordHash) +my $wordHash = wordcount ($inFastaFile, $wordSize, {fasta=>1}) =cut sub wordcount { - my ($seqStr, $wordSize) = @_; - my $seqLen = length ($seqStr); - $seqStr = uc ($seqStr); - my %wordHash; - for (my $i = 0; $i < length ($seqStr) - $wordSize + 1; $i++) + my ($seqStr, $wordSize, $opt) = @_; + + my $isFasta = exists $opt->{'fasta'} ? $opt->{'fasta'} : 0; + my $wordHash = exists $opt->{'wordHash'} ? $opt->{'wordHash'} : {}; + my $noMask = exists $opt->{'noMask'} ? $opt->{'noMask'} : 0; + + if ($isFasta) { - my $w = substr ($seqStr, $i, $wordSize); - next if $w=~/[^ACGT]/; - $wordHash{$w}++; + my $inFile = $seqStr; + my $seqIO = Bio::SeqIO->new (-file =>$inFile, -format => 'Fasta'); + while (my $seq = $seqIO->next_seq()) + { + $wordHash = wordcount ($seq->seq(), $wordSize, {wordHash=>$wordHash, noMask=>$noMask}); + } + } + else + { + my $seqLen = length ($seqStr); + $seqStr = uc ($seqStr); + for (my $i = 0; $i < length ($seqStr) - $wordSize + 1; $i++) + { + my $w = substr ($seqStr, $i, $wordSize); + if ($noMask == 0) + { + next if $w=~/[^ACGTU]/; + } + $wordHash->{$w}++; + } } - return \%wordHash; + return $wordHash; } @@ -543,7 +566,11 @@ sub bedToSeq { my $blockStart = $bed->{'chromStart'} + $bed->{'blockStarts'}->[$i]; my $blockSeqStr = substr ($chromSeq->{"seq"}, $blockStart, $bed->{'blockSizes'}->[$i]); - Carp::croak "incorrect size for transcript $name, block $i\n" if length ($blockSeqStr) != $bed->{'blockSizes'}->[$i]; + if (length ($blockSeqStr) != $bed->{'blockSizes'}->[$i]) + { + warn "incorrect size for transcript $name, block $i\n"; + return ""; + } $seqStr .= $blockSeqStr; } diff --git a/ViennaRNA.pm b/ViennaRNA.pm index 02e0c32..6a55beb 100644 --- a/ViennaRNA.pm +++ b/ViennaRNA.pm @@ -93,7 +93,7 @@ sub readRNAduplexFile { next if $line=~/^\s*$/; next if $line=~/^\>/; - $line =~/^(\S*?)\s+(\d+)\,(\d+)\s+\:\s+(\d+)\,(\d+)\s+\(\s*(\S*?)\)$/; + $line =~/^(\S*?)\s+(\d+)\,(\d+)\s+\:\s+(\d+)\,(\d+)\s+\(\s*(\S*?)\).*$/; my %entry = (struct=>$1, targetStart=>$2, targetEnd=>$3,