#!/usr/bin/perl

use strict;
use LogLine;
use Getopt::Long;
use Math::Trig;

# Groups the speakers (nicks) of each chat log named on the command line into
# --numconv clusters by comparing their word-frequency profiles with a
# k-means style loop, then prints the cluster index assigned to every nick.
#
# Options:
#   --outdir DIR   directory part of the per-log output name (default 'bytrevseg')
#   --numconv N    number of clusters, at least 1 (default 2)
#   --numiter N    number of refinement passes, 0 or more (default 10)
my $outdir = 'bytrevseg';
my $numconversations = 2;
my $numiter = 10;
GetOptions("outdir=s", \$outdir, "numconv=i", \$numconversations, "numiter=i", \$numiter)
	or die "Usage: $0 [--outdir DIR] [--numconv N] [--numiter N] logfile...\n";
# classify() needs at least one centroid to compare against.
die "--numconv must be at least 1\n" if $numconversations < 1;
die "--numiter must not be negative\n" if $numiter < 0;

foreach my $file (@ARGV) {
	# Basename of the log: everything after the last '/'.
	my ($basename) = $file =~ m#/?([^/]*)$#;
	my $outlog = "$outdir/bytrevseg-$basename";
	# NOTE(review): no file is opened for $outlog at present; the results
	# below go to STDOUT. The message is kept so existing output is unchanged.
	print "writing to $outlog\n";

	# LogLine is defined elsewhere. Judging by the names, the log is parsed and
	# server messages and markup are filtered out -- confirm against LogLine.
	# Each returned element is used below as a hash ref with 'text' and 'nick'.
	my @lines = &LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file)));
	unless (@lines) {
		warn "No data for $file: $!";
		next;
	}

	# nick => array ref of relative word frequencies over the log's vocabulary.
	my %nickvector = nickWordVectors(@lines);

	# Start from a random assignment of nicks to clusters.
	my @clustervectors;	# $clustervectors[$c]: member vectors of cluster $c
	my %whichcluster;	# nick => index of the cluster it currently belongs to
	foreach my $nick (keys %nickvector) {
		my $cluster = int rand $numconversations;
		push @{$clustervectors[$cluster]}, $nickvector{$nick};
		$whichcluster{$nick} = $cluster;
	}

	my @clusters;		# $clusters[$c]: centroid (mean vector) of cluster $c
	# Exactly $numiter passes (the range used to start at 0, giving one extra).
	foreach my $pass (1..$numiter) {
		# Rebalance: every centroid becomes the mean of its current members.
		# A cluster without members gets an empty centroid, which
		# vectorDistance compares as the zero vector.
		foreach my $cluster (0..$numconversations-1) {
			$clusters[$cluster] = [ vectorMean(@{ $clustervectors[$cluster] || [] }) ];
		}

		# Reassign: every nick moves to the cluster with the nearest centroid.
		@clustervectors = ();
		foreach my $nick (keys %nickvector) {
			my $cluster = classify($nickvector{$nick}, @clusters);
			push @{$clustervectors[$cluster]}, $nickvector{$nick};
			$whichcluster{$nick} = $cluster;
		}
	}

	# Sorted so that the report order does not depend on hash ordering.
	foreach my $nick (sort keys %whichcluster) {
		print "$nick is in cluster $whichcluster{$nick}\n";
	}
}

# Returns the lower-cased token with all leading and all trailing non-word
# characters removed; punctuation inside the token is left alone. The result
# is the empty string when nothing but punctuation was passed in.
sub normalizeWord {
	my ($word) = @_;
	return '' unless defined $word;
	$word =~ s/^\W+//;
	# Anchored at the end: an unanchored greedy "(.*)\W+" removes only the last
	# non-word character (or an interior one when the token ends in a word
	# character).
	$word =~ s/\W+$//;
	return lc $word;
}

# Builds one word-frequency vector per nick.
#
# Arguments: a list of hash refs, each providing 'nick' and 'text'.
# Returns:   a flat nick => array ref list (assign it to a hash). Every vector
#            has one slot per distinct word seen in any line, in sorted word
#            order, holding that nick's count of the word divided by the
#            nick's total word count. Nicks without any countable word are
#            left out.
sub nickWordVectors {
	my @lines = @_;
	my (%vocabulary, %countsfor, %totalfor);
	foreach my $line (@lines) {
		foreach my $token (split /\s+/, $line->{text}) {
			my $word = normalizeWord($token);
			next if $word eq '';
			$vocabulary{$word} = 1;
			$countsfor{$line->{nick}}{$word}++;
			$totalfor{$line->{nick}}++;
		}
	}

	my @words = sort keys %vocabulary;
	my %vectorfor;
	foreach my $nick (keys %countsfor) {
		my $total = $totalfor{$nick};
		# Words this nick never used count as 0.
		$vectorfor{$nick} = [ map { ($countsfor{$nick}{$_} || 0) / $total } @words ];
	}
	return %vectorfor;
}

# Returns the angle, in radians (0 .. pi), between two vectors.
#
# Arguments: two array refs of numbers. When the lengths differ, the missing
#            components of the shorter vector count as 0.
# Returns:   acos( (v1 . v2) / (|v1| * |v2|) ), or nothing (undef in scalar
#            context) when either vector has zero length, because the angle
#            is not defined then.
sub vectorAngle {
	my ($ref1, $ref2) = @_;
	my $last = $#{$ref1} > $#{$ref2} ? $#{$ref1} : $#{$ref2};

	my ($dot, $sq1, $sq2) = (0, 0, 0);
	foreach my $i (0..$last) {
		my $x = $ref1->[$i] || 0;
		my $y = $ref2->[$i] || 0;
		$dot += $x * $y;
		$sq1 += $x * $x;
		$sq2 += $y * $y;
	}
	return if $sq1 == 0 || $sq2 == 0;

	my $cosine = $dot / sqrt($sq1 * $sq2);
	# Rounding can push the ratio just outside [-1, 1], where Math::Trig's
	# acos would return a complex number.
	$cosine = 1 if $cosine > 1;
	$cosine = -1 if $cosine < -1;
	return acos($cosine);
}

# Returns the Euclidean distance between two vectors.
#
# Arguments: two array refs of numbers. When the lengths differ, the missing
#            components of the shorter vector count as 0, so the result is
#            the same whichever vector is passed first.
# Returns:   the square root of the summed squared component differences.
sub vectorDistance {
	my ($ref1, $ref2) = @_;
	# Walk the longer of the two vectors; the arrays are read through their
	# references rather than copied.
	my $last = $#{$ref1} > $#{$ref2} ? $#{$ref1} : $#{$ref2};

	my $sum = 0;
	foreach my $i (0..$last) {
		$sum += (($ref1->[$i] || 0) - ($ref2->[$i] || 0))**2;
	}
	return sqrt $sum;
}

# Component-wise mean of a list of vectors.
#
# Arguments: any number of array refs of numbers.
# Returns:   a list whose i-th entry is the sum of the i-th components divided
#            by the number of arguments; the list is as long as the longest
#            vector, and empty when no vectors are given.
sub vectorMean {
	my $count = @_;
	my @totals;
	for my $vec (@_) {
		# Summing stops at the first false argument, although $count above
		# still includes it and everything after it.
		last unless $vec;
		my $slot = 0;
		$totals[$slot++] += $_ for @$vec;
	}
	$_ /= $count for @totals;
	return @totals;
}

# Finds the centroid nearest to a vector.
#
# Arguments: an array ref (the vector to place), followed by one array ref
#            per candidate centroid.
# Returns:   the zero-based position, among the centroid arguments, of the one
#            with the smallest vectorDistance to the vector; on a tie the
#            earliest centroid wins.
sub classify {
	my ($point, @centroids) = @_;
	my ($winner, $shortest) = (0, vectorDistance($point, $centroids[0]));
	for my $idx (1..$#centroids) {
		my $candidate = vectorDistance($point, $centroids[$idx]);
		# Strictly smaller only, so an equal distance never displaces the
		# current winner.
		next unless $candidate < $shortest;
		($winner, $shortest) = ($idx, $candidate);
	}
	return $winner;
}
