#!/usr/bin/perl

use strict;
use LogLine;
use Getopt::Long;
use Math::Trig;

# Command-line configuration (defaults shown below):
#   --outdir=DIR   directory used when naming the per-file output log
#   --numconv=N    number of conversations (clusters) to look for
#   --numiter=N    number of clustering iterations to run
my $outdir = 'bytrevseg';
my $numconversations = 2;
my $numiter = 10;

# GetOptions returns false when it could not parse the command line; stop
# here rather than silently clustering with settings the user did not intend.
GetOptions(
	'outdir=s'  => \$outdir,
	'numconv=i' => \$numconversations,
	'numiter=i' => \$numiter,
) or die "Usage: $0 [--outdir=DIR] [--numconv=N] [--numiter=N] logfile...\n";

# For every log file named on the command line: build one normalised
# bag-of-words vector per line, print a few diagnostic angles/distances,
# split the lines into $numconversations clusters with a k-means style loop
# and finally print the raw lines of each cluster.
die "numconv must be at least 1\n" if $numconversations < 1;

foreach my $file (@ARGV) {
	# The log name is derived from the input's base name.
	my ($basename) = $file =~ m#([^/]*)$#;
	my $outlog = "$outdir/bytrevseg-$basename";
	print "writing to $outlog\n";
	# NOTE: writing to $outlog is disabled; all output goes to STDOUT.
	#open WRITE, ">$outlog" or (warn "Can't open $outlog: $!" and next);

	my @lines = &LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file)));
	unless (@lines) {
		# Nothing to cluster; skip the file instead of failing further down.
		warn "No data for $file: $!";
		next;
	}
	my @linetext = LogLine::stringlistnonick(@lines);

	# $histogram{$word}[$line] = number of times $word occurs on that line.
	my %histogram;
	foreach my $i (0..$#linetext) {
		foreach my $word (split /\s+/, $linetext[$i]) {
			# Strip punctuation from either end of the word, keeping the
			# word itself.
			$word =~ s/^\W+//;
			$word =~ s/\W+$//;
			$word = lc $word;
			next if $word eq '';
			$histogram{$word}[$i]++;
		}
	}

	# One dense vector per line with one component per distinct word (in
	# sorted order). Every line gets a vector, even a wordless one, so that
	# %vectolinenum can always map a vector back to its line.
	my @words = sort keys %histogram;
	my @vectors;
	my %vectolinenum;
	foreach my $i (0..$#linetext) {
		$vectors[$i] = [(0) x @words];
		$vectolinenum{$vectors[$i]} = $i;
	}
	foreach my $keynum (0..$#words) {
		my $counts = $histogram{$words[$keynum]};
		foreach my $i (0..$#{$counts}) {
			$vectors[$i][$keynum] = $counts->[$i] || 0;
		}
	}

	# Scale each vector so its components sum to 1; all-zero vectors are
	# left untouched.
	foreach my $vec (@vectors) {
		my $sum = 0;
		$sum += $_ foreach @$vec;
		next if $sum == 0;
		$_ /= $sum foreach @$vec;
	}

	# Diagnostics. Each one is skipped when the log is too short to have
	# the vectors it refers to.
	foreach my $index (0, 2) {
		print join('#', @{$vectors[$index]}), "\n" if $vectors[$index];
	}
	my @pairs = (
		['one',  'two',     1, 0],
		['one',  'three',   2, 0],
		['four', 'sixteen', 3, 15],
	);
	foreach my $measure (['Angle', \&vectorAngle], ['Distance', \&vectorDistance]) {
		my ($label, $function) = @$measure;
		foreach my $pair (@pairs) {
			my ($firstname, $secondname, $first, $second) = @$pair;
			next unless $vectors[$first] && $vectors[$second];
			print "$label between vector $firstname and $secondname is "
				. $function->($vectors[$first], $vectors[$second]) . "\n";
		}
	}

	# Start from exactly $numconversations random centroids with every
	# component drawn from [-1, 1).
	my @clusters;
	foreach my $i (0..$numconversations - 1) {
		$clusters[$i] = [ map { rand(2) - 1 } @words ];
	}

	# Run exactly $numiter assign/re-centre rounds.
	my @clustervectors;
	foreach my $iteration (1..$numiter) {
		# cluster: start every round with empty membership lists so that
		# members do not pile up from earlier rounds.
		@clustervectors = map { [] } @clusters;
		foreach my $vec (@vectors) {
			# The value returned by classify() is used as the cluster index.
			my $cluster = classify($vec, @clusters);
			push @{$clustervectors[$cluster]}, $vec;
		}
		# rebalance: move each centroid to the mean of its members. A
		# cluster without members keeps its previous centroid.
		foreach my $cluster (0..$#clusters) {
			next unless @{$clustervectors[$cluster]};
			@{$clusters[$cluster]} = vectorMean(@{$clustervectors[$cluster]});
		}
	}

	# Report the membership recorded during the last round.
	foreach my $i (0..$#clusters) {
		print "*** Cluster $i ***\n";
		foreach my $vec (@{$clustervectors[$i]}) {
			print $lines[$vectolinenum{$vec}]->{raw};
		}
	}
}

# Return the angle, in radians in the range 0..pi, between two vectors.
#
# Arguments: two array references holding numeric components. The vectors
# may differ in length; missing or undefined components count as 0.
# Returns undef (empty list in list context) when either vector has zero
# length, because the angle is not defined in that case.
sub vectorAngle {
	my ($ref1, $ref2) = @_;
	my @vec1 = @$ref1;
	my @vec2 = @$ref2;

	my $last = $#vec1 > $#vec2 ? $#vec1 : $#vec2;
	my ($dot, $squares1, $squares2) = (0, 0, 0);
	foreach my $i (0..$last) {
		my $x = $vec1[$i] || 0;
		my $y = $vec2[$i] || 0;
		$dot      += $x * $y;
		$squares1 += $x * $x;
		$squares2 += $y * $y;
	}
	return if $squares1 == 0 || $squares2 == 0;

	# cos(angle) = dot / (|v1| * |v2|). Clamp to [-1, 1] so floating-point
	# rounding cannot push the argument outside acos's real domain.
	my $cosine = $dot / sqrt($squares1 * $squares2);
	$cosine = 1 if $cosine > 1;
	$cosine = -1 if $cosine < -1;
	return acos($cosine);
}

# Return the Euclidean distance between two vectors.
#
# Arguments: two array references holding numeric components. The vectors
# may differ in length; the shorter one is treated as padded with zeros, and
# undefined components count as 0, so every dimension of both vectors
# contributes to the result.
sub vectorDistance {
	my ($ref1, $ref2) = @_;
	my @vec1 = @$ref1;
	my @vec2 = @$ref2;

	# Walk the longer of the two vectors so that no component is skipped.
	my $last = $#vec1 > $#vec2 ? $#vec1 : $#vec2;
	my $sum = 0;
	foreach my $i (0..$last) {
		my $difference = ($vec1[$i] || 0) - ($vec2[$i] || 0);
		$sum += $difference * $difference;
	}
	return sqrt $sum;
}

# Return the component-wise mean of the vectors passed in.
#
# Arguments: a list of array references. Each component total is divided
# by the number of arguments received, so a vector shorter than the longest
# one contributes nothing to the components it lacks. Summing stops at the
# first false argument. Returns the means as a list (empty when no vectors
# were given).
sub vectorMean {
	my $count = @_;
	my @totals;
	foreach my $vector (@_) {
		last unless $vector;
		my $index = 0;
		foreach my $value (@$vector) {
			$totals[$index++] += $value;
		}
	}
	my @averages = map { $_ / $count } @totals;
	return @averages;
}

# Find the candidate vector closest to a given vector.
#
# Arguments: an array reference for the vector to classify, followed by one
# or more array references for the candidates (e.g. cluster centroids).
# Returns the zero-based position of the nearest candidate in that list, as
# measured by vectorDistance(); on a tie the earliest candidate wins.
sub classify {
	my $ref = shift;
	my $closest = 0;
	my $closestdist = vectorDistance($ref, $_[0]);
	foreach my $i (1..$#_) {
		my $dist = vectorDistance($ref, $_[$i]);
		if ($dist < $closestdist) {
			$closest = $i;
			$closestdist = $dist;
		}
	}
	# Return the index, not the distance: the caller uses the result to
	# subscript its per-cluster list.
	return $closest;
}
