#!/usr/bin/perl
#
# Reads log files, keeps every line that LogLine->new() accepts, and cuts the
# words of those lines into fixed-size token sequences.
#
# usage:
#   ./parselog.pl --logdir=dirname   seq_size block_size
#   ./parselog.pl --onefile=filename seq_size block_size
#
#   seq_size    number of words per sequence       (first positional argument)
#   block_size  number of sequences per block      (second positional argument)

use strict;
use warnings;
use Getopt::Long;
use LogLine;

package main;

my ($logdir, $onefile) = ('.', '');
GetOptions("logdir=s" => \$logdir, "onefile=s" => \$onefile);

# --onefile wins over --logdir; otherwise take every entry in the directory.
my @files = $onefile ? ($onefile) : <$logdir/*>;
die "No files to use" unless @files;

# Every element of @lines is whatever LogLine->new() returned for one raw
# line; the code below reads its {text} field (and later code its {time}).
my @lines;

foreach my $file (@files) {
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode or a command.
    my $fh;
    unless (open $fh, '<', $file) {
        warn "Cannot open $file: $!\n";
        next;
    }

    # Read the file line by line (the previous empty `while ()` condition
    # never read anything and never terminated).
    while (defined(my $raw = <$fh>)) {
        my $line = LogLine->new($raw);

        # LogLine->new() may return undef; such lines are dropped.
        push @lines, $line if defined $line;
    }

    close $fh;
}

#print scalar @lines, " total lines.\n";

####
#
# at this point, @lines contains a bunch of hashes that have the line and
# its info
#
####

# GetOptions() has removed the options, so the positional sizes are left.
my $seq_size   = $ARGV[0];
my $block_size = $ARGV[1];

for my $size ($seq_size, $block_size) {
    die "usage: $0 [--logdir=dirname | --onefile=filename] seq_size block_size\n"
        unless defined $size and $size =~ /\A[1-9][0-9]*\z/;
}

my @sequences;       # $sequences[seq#][token#] = word
my $cseq = 0;        # index of the sequence currently being filled
my $ctok = 0;        # number of words already in that sequence

# Declared at file scope; later sections of the script may reuse it as a
# loop variable.
my $k;

# these two are for remembering where sequences come from:
# $seq_lines[seq#] is the index into @lines where that sequence starts.
my $cur_line = 0;
my @seq_lines;
$seq_lines[0] = 0;

#
# this loop takes the lines of text, and turns them into sequences
#
foreach my $entry (@lines) {
    my $tline = defined $entry->{text} ? $entry->{text} : '';

    # Keep letters, digits, underscore and apostrophe; every run of any
    # other character (including the newline) collapses to one space.
    $tline =~ tr/A-Za-z0-9_'/ /cs;

    # split ' ' ignores leading whitespace, so no empty first word appears.
    my @words = split ' ', $tline;

    foreach my $word (@words) {
        # the current sequence is full: start a new one
        if ($ctok >= $seq_size) {
            $cseq++;
            $ctok = 0;

            # indicate that the new sequence starts with this line
            $seq_lines[$cseq] = $cur_line;
        }

        $sequences[$cseq][$ctok] = $word;
        $ctok++;
    }

    $cur_line++;    # remember, this is for figuring stuff out later
}

# there are now sequences that are $seq_size in length.
# the last one is not complete in size.
####
#
# assign blocks their appropriate sequences
#
# Blocks form a sliding window over the sequences: block b covers sequences
# b .. b + $block_size - 1, and consecutive blocks are shifted by one
# sequence.  $block_size is expected to be a positive integer.
#
####
my @blocks;
my $num_seq  = $cseq + 1;    # total num of sequences
my $base_seq = 0;            # first sequence of the block being built
my $cblk     = 0;            # index of the block being built

#print "there are $num_seq total sequences\n";
while ($base_seq + $block_size <= $num_seq) {
    # Each slot holds a reference to the element of @sequences (a reference
    # to an array reference), so readers dereference twice: @{ ${ $slot } }.
    $blocks[$cblk] =
        [ map { \$sequences[$_] } $base_seq .. $base_seq + $block_size - 1 ];

    # move to the next block and also change the sequence range.
    $cblk++;
    $base_seq++;
}
my $num_blocks = $cblk;

#
# get word counts per block.
#
my @block_count;    # $block_count[block#]{word} = # of occurrences

# Declared at file scope; later sections of the script may reuse them.
my $dd = 0;
my $j;
my $i;

# for each block
foreach my $block (@blocks) {
    # Start from an empty hash so that a block without any words still gets
    # an entry instead of leaving an undefined hole in @block_count.
    my %count_of;

    # for each sequence in the block
    foreach my $seq_ref (@{ $block }) {
        my $words = ${ $seq_ref };
        next unless $words;    # sequence slot that never received a word

        # for each word in the sequence
        $count_of{$_}++ for @{ $words };
    }

    $block_count[$dd] = \%count_of;
    $dd++;
}

# for each block, calculate its weights.
# Per-block term weights.
#   $block_weights[block]{word} = occurrences of word in the block divided
#                                 by the number of distinct words in it
#   $block_weights_2[block]     = sum of the squared weights of the block
my @block_weights;
my @block_weights_2;

for my $blk (0 .. $#block_count) {
    # Tolerate a missing entry for a block that contained no words.
    my $counts = $block_count[$blk] || {};

    # number of unique words in block
    my $numwords = scalar keys %{ $counts };

    # init the squared sum to zero
    $block_weights_2[$blk] = 0;
    $block_weights[$blk]   = {};

    foreach my $word (keys %{ $counts }) {
        my $weight = $counts->{$word} / $numwords;
        $block_weights[$blk]{$word} = $weight;
        $block_weights_2[$blk] += $weight * $weight;
    }
}

# Gap g lies between block g and block g + $block_size, i.e. between two
# windows that do not share any sequence.
my $num_gaps = $num_seq - (2 * $block_size) + 1;

#print "$seq_size\tterms per sequence\n";
#print "$block_size\tsequences per block\n";
#print "$num_seq\tsequences in this doc\n";
#print "$num_gaps\tgaps in this doc\n";

# for each block gap, calculate the similarity (cosine of the two weight
# vectors: dot product over the product of the vector lengths).
my @gap_sim;
my @similarity;

for my $gap (0 .. $num_gaps - 1) {
    my $b1 = $gap;
    my $b2 = $gap + $block_size;

    my $w1 = $block_weights[$b1] || {};
    my $w2 = $block_weights[$b2] || {};

    # calculate the bottom.
    my $bottom = sqrt( ($block_weights_2[$b1] || 0)
                     * ($block_weights_2[$b2] || 0) );

    # calculate the top
    my $top = 0;
    foreach my $word (keys %{ $w1 }) {
        $top += $w1->{$word} * $w2->{$word} if exists $w2->{$word};
    }

    # A block without words has zero length; report similarity 0 instead
    # of dying with a division by zero.
    $similarity[$gap] = $bottom > 0 ? $top / $bottom : 0;
    print "$gap\t$similarity[$gap]\n";
}

####
#
# calculate the depth scores
#
# For every gap, climb left and right while the similarity does not
# decrease; the depth is how far the gap lies below those two peaks.
#
####
my @depths;

open my $lf_fh, '>', 'lf.out' or die "Cannot write lf.out: $!";
open my $rf_fh, '>', 'rf.out' or die "Cannot write rf.out: $!";
open my $df_fh, '>', 'df.out' or die "Cannot write df.out: $!";

##### check results with and without -+ 1 in init
my $avg_depth = 0;

for my $gap (0 .. $num_gaps - 1) {
    my $current = $similarity[$gap];

    # scan left until peak or out of bounds
    my $left = $current;
    for (my $n = $gap - 1; $n >= 0 and $left <= $similarity[$n]; $n--) {
        $left = $similarity[$n];
    }

    # scan right until peak or out of bounds
    my $right = $current;
    for (my $n = $gap + 1; $n < $num_gaps and $right <= $similarity[$n]; $n++) {
        $right = $similarity[$n];
    }

    # add and assign
    my $depth = ($left - $current) + ($right - $current);
    $depths[$gap] = $depth;

    print {$lf_fh} "$gap\t$left\n";
    print {$rf_fh} "$gap\t$right\n";
    print {$df_fh} "$gap\t$depth\n";

    $avg_depth += $depth;
}

# get the average instead of sum; with no gaps the average stays 0 rather
# than dividing by zero.
$avg_depth /= $num_gaps if $num_gaps > 0;

close $lf_fh or die "Cannot close lf.out: $!";
close $rf_fh or die "Cannot close rf.out: $!";
close $df_fh or die "Cannot close df.out: $!";

####
#
# use the depth peaks as the separation values
#
# A gap is reported when its depth is a strict local maximum and exceeds
# 1.5 times the average depth.
#
####
open my $tb_fh,  '>', 'tb.out'  or die "Cannot write tb.out: $!";
open my $wih_fh, '>', 'wih.out' or die "Cannot write wih.out: $!";

my $limit = 1.5 * $avg_depth;

for my $gap (1 .. $num_gaps - 2) {
    my ($left, $current, $right) = @depths[ $gap - 1 .. $gap + 1 ];

    if ($left < $current and $right < $current and $current > $limit) {
        print {$tb_fh} "$gap\n";

        # The right-hand block of gap $gap starts at sequence
        # $block_size + $gap; $seq_lines maps that sequence back to the
        # index of the line it starts on.
        my $srcline = $seq_lines[$block_size + $gap];
        print {$wih_fh} "sequence:$gap\t$lines[$srcline]->{time}\n";
    }
}

close $tb_fh  or die "Cannot close tb.out: $!";
close $wih_fh or die "Cannot close wih.out: $!";

####
#
# figure out where in the lines it corresponds to.
#
# an array of each sequence and its starting line.
#
####
# $seq_lines;
#
# my $src_line = $block_size + 8 ;