#!/usr/bin/perl
#
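# usage:
#   ./parselog.pl --logdir=dirname seq_size block_size
#   ./parselog.pl --onefile=filename seq_size block_size
#
# seq_size   = number of words per sequence   (first positional argument)
# block_size = number of sequences per block  (second positional argument)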


use strict;
use Getopt::Long;
use LogLine;


package main;

# Command-line options:
#   --logdir=DIR    read every entry matched by the glob DIR/*  (default '.')
#   --onefile=FILE  read only FILE; when given it takes precedence over --logdir
my ($logdir, $onefile) = ('.', '');
GetOptions("logdir=s" => \$logdir, "onefile=s" => \$onefile);

my @files = $onefile ? ($onefile) : <$logdir/*>;
die "No files to use" unless @files;

# Every object for which LogLine->new returned a defined value, in the order
# the files (and the lines within them) were read.
my @lines;

foreach my $file (@files) {
    # Three-argument open with a lexical handle: the file name can never be
    # misread as a mode or a pipe, and the handle is released per file.
    # A file that cannot be opened is reported and skipped instead of
    # silently contributing nothing.
    my $fh;
    unless (open $fh, '<', $file) {
        warn "Cannot open '$file' for reading: $!\n";
        next;
    }

    # $_ is deliberately kept as the line variable so LogLine->new sees
    # exactly what it saw before.
    while (<$fh>) {
        my $line = LogLine->new($_);
        push @lines, $line if defined $line;
    }

    close $fh;
}
#print scalar @lines, " total lines.\n";






####
#
# at this point, @lines contains a bunch of hashes that have the line and its info
#
####

# Positional arguments left in @ARGV once GetOptions has removed the options:
#   $ARGV[0]  seq_size   - number of words placed in each sequence
#   $ARGV[1]  block_size - number of sequences placed in each block
my $seq_size = $ARGV[0];
my $block_size = $ARGV[1];

# Both values must be positive integers.  Without this check a missing or
# non-positive block_size produces only empty blocks and the similarity step
# further down aborts with "Illegal division by zero"; a missing or
# non-positive seq_size silently degenerates to one word per sequence.
foreach my $param ([ seq_size => $seq_size ], [ block_size => $block_size ]) {
    my ($name, $value) = @{ $param };
    next if defined $value and $value =~ /\A[0-9]+\z/ and $value > 0;
    die "usage: $0 [--logdir=dirname | --onefile=filename] seq_size block_size\n"
      . "$name must be a positive integer\n";
}



# @sequences[n] is an array of up to $seq_size words; words flow from one log
# line into the next, so a sequence may span several lines.
my @sequences;
my $cseq = 0;    # index of the sequence currently being filled
my $ctok = 0;    # number of words already stored in that sequence

# Declared at file scope because the sections further down use $k as their
# loop variable.
my $k;

# $seq_lines[n] is the index (into @lines) of the line that was being
# processed when sequence n was started.  Sequence 0 is attributed to line 0.
my $cur_line  = 0;
my @seq_lines = (0);

for my $entry (@lines) {
    my $text = $entry->{text};

    # Normalise: everything except letters, digits, underscore and apostrophe
    # becomes a space, runs of spaces collapse to one, and a leading space is
    # dropped so that split does not yield an empty first field.
    chomp $text;
    $text =~ s/[^A-Za-z0-9_\']/\ /g;
    $text =~ s/\ +/\ /g;
    $text =~ s/^\ //;

    for my $word (split / /, $text) {
        # The current sequence is full: open the next one and remember which
        # line it starts on.
        if ($ctok >= $seq_size) {
            $cseq++;
            $ctok = 0;
            $seq_lines[$cseq] = $cur_line;
        }

        # $ctok always equals the current length of the sequence, so
        # appending is the same as storing at index $ctok.
        push @{ $sequences[$cseq] }, $word;
        $ctok++;
    }

    $cur_line++;
}
# print "++$cseq++", scalar @ { $sequences[$cseq] }, "\n";
# the sequences are now each $seq_size words long,
# except the last one, which may be shorter.








####
#
# assign blocks their appropriate sequences
#
####

# A block is a window of $block_size consecutive sequences.  The window slides
# one sequence at a time, so block n covers sequences n .. n + $block_size - 1.
# Each block entry is a reference to the corresponding slot of @sequences
# (a reference to the scalar holding the array ref, hence the double
# dereference used when the words are read back).
my @blocks;
my $num_seq  = $cseq + 1;    # sequences are numbered 0 .. $cseq
my $base_seq = 0;            # first sequence of the block being built
my $cblk     = 0;            # index of the block being built

while ($base_seq + $block_size <= $num_seq) {
    my $stop = $base_seq + $block_size;    # one past the block's last sequence
    my $h    = $base_seq;

    while ($h < $stop) {
        push @{ $blocks[$cblk] }, \$sequences[$h];
        $h++;
    }

    # Slide the window forward by one sequence.
    $cblk++;
    $base_seq++;
}

my $num_blocks = $cblk;



#test out the above.. make sure we can get to the words.
#
# at the same time, get word counts per block.
#

# $block_count[b]{word} = number of times word occurs in block b.
my @block_count;
my $dd = 0;    # index of the block being counted
my $df = 0;    # number of sequences visited so far in the current block
my $j;         # file-scoped scratch variables; the sections below use them
my $i;         # as loop variables

for my $block (@blocks) {
    $df = 0;

    for my $seq_ref (@{ $block }) {
        # $seq_ref points at a slot of @sequences, which in turn holds the
        # array of words.  Incrementing a missing hash entry yields 1, which
        # covers the "first occurrence" case.
        $block_count[$dd]{$_}++ for @{ ${$seq_ref} };
        $df++;
    }

    $dd++;
}




# for each block, calculate its weights.

# $block_weights[b]{word} = occurrences of word in block b divided by the
#                           number of distinct words in block b.
# $block_weights_2[b]     = sum of the squared weights of block b.
my @block_weights;
my @block_weights_2;
$dd = 0;
for my $counts (@block_count) {
    my @vocab    = keys %{ $counts };
    my $numwords = scalar @vocab;    # distinct words in this block

    # Start the squared sum at zero so it is defined even for an empty block.
    $block_weights_2[$dd] = 0;

    for my $word (@vocab) {
        my $weight = $counts->{$word} / $numwords;
        $block_weights[$dd]{$word} = $weight;
        $block_weights_2[$dd] += $weight * $weight;
    }

    $dd++;
}







# Gap g separates block g from block g + $block_size, i.e. two adjacent,
# non-overlapping windows of sequences.  With $num_seq sequences there are
# $num_seq - 2 * $block_size + 1 such pairs.
#   $seq_size    terms per sequence
#   $block_size  sequences per block
#   $num_seq     sequences in this document
#   $num_blocks  blocks in this document
#   $num_gaps    gaps in this document
my $num_gaps = $num_seq - (2 * $block_size) + 1;

# $similarity[g] = (sum over shared words of w1 * w2)
#                  / sqrt(sum of squared weights of block 1
#                         * sum of squared weights of block 2)
# Each value is also printed to STDOUT as "gap<TAB>similarity".
my @similarity;
for (my $gap = 0; $gap < $num_gaps; $gap++) {
    my $b1 = $gap;
    my $b2 = $gap + $block_size;

    # Fall back to an empty hash / zero so that a block without any words
    # neither autovivifies entries nor produces an undefined operand.
    my $weights1 = $block_weights[$b1] || {};
    my $weights2 = $block_weights[$b2] || {};

    # Denominator: product of the two blocks' squared-weight sums, rooted.
    my $bottom = sqrt( ($block_weights_2[$b1] || 0) * ($block_weights_2[$b2] || 0) );

    # Numerator: only words present in both blocks contribute.
    my $top = 0;
    foreach my $word (keys %{ $weights1 }) {
        next unless exists $weights2->{$word};
        $top += $weights1->{$word} * $weights2->{$word};
    }

    # A block with no words gives a zero denominator; report similarity 0
    # instead of dying with "Illegal division by zero".
    $similarity[$gap] = $bottom > 0 ? $top / $bottom : 0;

    print "$gap\t$similarity[$gap]\n";
}





####
#
# calculate the depth scores
#
####

# $depths[g] = (left peak - similarity[g]) + (right peak - similarity[g]),
# where each peak is found by walking away from gap g for as long as the
# similarity does not decrease.
#
# Side outputs, one "gap<TAB>value" line per gap:
#   lf.out  left peak      rf.out  right peak      df.out  depth score
my @depths;

open(my $left_fh,  '>', 'lf.out') or die "Cannot open lf.out for writing: $!\n";
open(my $right_fh, '>', 'rf.out') or die "Cannot open rf.out for writing: $!\n";
open(my $depth_fh, '>', 'df.out') or die "Cannot open df.out for writing: $!\n";

# NOTE (carried over from the original author): compare the results with and
# without the -1 / +1 offsets in the two scan initialisers below.

my $avg_depth = 0;

for (my $gap = 0; $gap < $num_gaps; $gap++) {
    my $current = $similarity[$gap];

    # Walk left until the similarity drops or the array is exhausted.
    my $left = $current;
    for (my $n = $gap - 1; $n >= 0 and $left <= $similarity[$n]; $n--) {
        $left = $similarity[$n];
    }

    # Walk right until the similarity drops or the array is exhausted.
    my $right = $current;
    for (my $n = $gap + 1; $n < $num_gaps and $right <= $similarity[$n]; $n++) {
        $right = $similarity[$n];
    }

    my $depth = ($left - $current) + ($right - $current);
    $depths[$gap] = $depth;

    print {$left_fh}  "$gap\t$left\n";
    print {$right_fh} "$gap\t$right\n";
    print {$depth_fh} "$gap\t$depth\n";

    $avg_depth += $depth;
}

# Turn the sum into a mean.  With no gaps at all (input too short for two
# adjacent blocks) the average stays 0 instead of dividing by zero.
$avg_depth /= scalar @depths if @depths;

# Buffered write errors only surface at close, so check it.
close($left_fh)  or die "Error writing lf.out: $!\n";
close($right_fh) or die "Error writing rf.out: $!\n";
close($depth_fh) or die "Error writing df.out: $!\n";


####
#
# use the depth peaks as the separation values
#
####

# A gap is reported as a boundary when its depth score is a strict local
# maximum (greater than both neighbours) and exceeds 1.5 times the mean depth.
#
# Outputs:
#   tb.out   one boundary gap index per line
#   wih.out  "sequence:<gap><TAB><time>" where <time> is the {time} field of
#            the log line on which the sequence following the gap starts
open(my $tb_fh,  '>', 'tb.out')  or die "Cannot open tb.out for writing: $!\n";
open(my $wih_fh, '>', 'wih.out') or die "Cannot open wih.out for writing: $!\n";

# The threshold does not depend on the gap, so compute it once.
my $limit = 1.5 * $avg_depth;

# The first and last gaps have only one neighbour and are never candidates.
for (my $gap = 1; $gap < $num_gaps - 1; $gap++) {
    my $left    = $depths[$gap - 1];
    my $current = $depths[$gap];
    my $right   = $depths[$gap + 1];

    next unless $left < $current and $right < $current and $current > $limit;

    print {$tb_fh} "$gap\n";

    # Gap g lies in front of block g + $block_size, whose first sequence has
    # that same index; @seq_lines maps it back to a line of @lines.
    my $srcline = $seq_lines[$block_size + $gap];
    print {$wih_fh} "sequence:$gap\t$lines[$srcline]->{time}\n";
}

# Buffered write errors only surface at close, so check it.
close($tb_fh)  or die "Error writing tb.out: $!\n";
close($wih_fh) or die "Error writing wih.out: $!\n";



####
#
# figure out where in the lines it corresponds to.
#
# an array of each sequence and its starting line.
#
# ####
# $seq_lines;
#
#

# NOTE(review): $src_line is assigned here but never read anywhere in the
# visible file, and the constant 8 has no stated meaning.  It looks like the
# start of an unfinished sequence-to-line lookup (compare the @seq_lines use
# in the boundary loop above) -- confirm the intent before removing it.
my $src_line = $block_size + 8 ;




























