#!/usr/bin/perl
#
# Groups the lines of each log file given on the command line into
# "conversations" by k-means clustering of per-line word-frequency vectors,
# then prints the raw lines of every cluster to STDOUT.
#
# Options:
#   --outdir  DIR  directory used to build the reported output name
#                  (default 'bytrevseg')
#   --numconv N    number of clusters to build (default 2)
#   --numiter N    number of centroid refinement steps (default 10)

use strict;
use warnings;

use LogLine;
use Getopt::Long;
use Math::Trig;

my $outdir           = 'bytrevseg';
my $numconversations = 2;
my $numiter          = 10;

GetOptions(
    "outdir=s",  \$outdir,
    "numconv=i", \$numconversations,
    "numiter=i", \$numiter,
) or die "Usage: $0 [--outdir DIR] [--numconv N] [--numiter N] LOGFILE...\n";

# classify() needs at least one centroid to compare against.
die "--numconv must be at least 1\n"  if $numconversations < 1;
die "--numiter must not be negative\n" if $numiter < 0;

foreach my $file (@ARGV) {
    my ($basename) = $file =~ m#/?([^/]*)$#;
    my $outlog = "$outdir/bytrevseg-$basename";
    print "writing to $outlog\n";

    # NOTE(review): writing to $outlog is disabled; all output below goes to
    # STDOUT even though the message above names the file.
    #open WRITE, ">$outlog" or (warn "Can't open $outlog: $!" and next);

    # The LogLine filters keep their original '&' call form because LogLine's
    # declarations (and any prototypes) are not visible from this file.
    my @lines = &LogLine::filterMarkup(
        &LogLine::filterServerMessages(LogLine::readFile($file)));
    unless (@lines) {
        warn "No data for $file: $!";
        next;    # nothing to cluster; carrying on would dereference undef
    }

    # Assumes stringlistnonick returns one string per entry of @lines, in the
    # same order -- TODO confirm against LogLine.
    my @linetext = LogLine::stringlistnonick(@lines);

    # %histogram: word => [ occurrences on line 0, occurrences on line 1, ... ]
    my %histogram;
    foreach my $i (0 .. $#linetext) {
        next unless defined $linetext[$i];
        foreach my $word (split /\s+/, $linetext[$i]) {
            # Strip punctuation around the word (the old single substitution
            # erased the whole word instead of trimming it).
            $word =~ s/^\W+//;
            $word =~ s/\W+$//;
            $word = lc $word;
            next if $word eq '';
            $histogram{$word}[$i]++;
        }
    }

    # One dense vector per line, one component per distinct word (sorted),
    # scaled so that the components of a non-empty line sum to 1.
    my @words = sort keys %histogram;
    my @vectors;
    foreach my $i (0 .. $#linetext) {
        my @counts;
        my $total = 0;
        foreach my $word (@words) {
            my $count = $histogram{$word}[$i] || 0;
            push @counts, $count;
            $total += $count;
        }
        if ($total != 0) {
            $_ /= $total foreach @counts;
        }
        push @vectors, \@counts;
    }

    # Diagnostic output for a few fixed vectors; each item is skipped when the
    # log is too short to contain the vectors involved.
    foreach my $idx (0, 2) {
        print join '#', @{ $vectors[$idx] } if defined $vectors[$idx];
    }
    my @samples = (
        [ 1, 0,  'one and two' ],
        [ 2, 0,  'one and three' ],
        [ 3, 15, 'four and sixteen' ],
    );
    foreach my $measure ([ 'Angle', \&vectorAngle ], [ 'Distance', \&vectorDistance ]) {
        my ($label, $code) = @$measure;
        foreach my $sample (@samples) {
            my ($first, $second, $names) = @$sample;
            next unless defined $vectors[$first] && defined $vectors[$second];
            print "$label between vector $names is "
                . $code->($vectors[$first], $vectors[$second]) . "\n";
        }
    }

    # Random initial centroids with every component in [-1, 1).
    my $dims = scalar @words;
    my @clusters;
    foreach my $c (0 .. $numconversations - 1) {
        $clusters[$c] = [ map { rand(2) - 1 } 1 .. $dims ];
    }

    # k-means: $numiter refinement steps followed by one final assignment.
    # @members holds, per cluster, the indices of the lines assigned to it and
    # is rebuilt on every pass so that lines are not listed repeatedly.
    my @members;
    foreach my $pass (0 .. $numiter) {
        # cluster
        @members = map { [] } @clusters;
        foreach my $i (0 .. $#vectors) {
            my $cluster = classify($vectors[$i], @clusters);
            push @{ $members[$cluster] }, $i;
        }
        last if $pass == $numiter;    # centroids are not needed after this

        # rebalance; a cluster without members gets an empty centroid, which
        # vectorDistance treats as the origin
        foreach my $c (0 .. $#clusters) {
            @{ $clusters[$c] } = vectorMean(map { $vectors[$_] } @{ $members[$c] });
        }
    }

    foreach my $c (0 .. $#clusters) {
        print "*** Cluster $c ***\n";
        foreach my $i (@{ $members[$c] }) {
            print $lines[$i]->{raw};
        }
    }
}

# vectorAngle($ref1, $ref2)
# Returns the angle in radians between the two vectors given as array
# references. Missing or undefined components count as 0. The angle is
# undefined when either vector has zero length; 0 is returned in that case.
sub vectorAngle {
    my ($ref1, $ref2) = @_;
    my $last = $#{$ref1} > $#{$ref2} ? $#{$ref1} : $#{$ref2};
    my ($dot, $sq1, $sq2) = (0, 0, 0);
    foreach my $i (0 .. $last) {
        my $x = $ref1->[$i] || 0;
        my $y = $ref2->[$i] || 0;
        $dot += $x * $y;
        $sq1 += $x * $x;
        $sq2 += $y * $y;
    }
    return 0 if $sq1 == 0 || $sq2 == 0;

    # Clamp against rounding error: Math::Trig's acos yields a complex number
    # for arguments outside [-1, 1].
    my $cosine = $dot / sqrt($sq1 * $sq2);
    $cosine = 1  if $cosine > 1;
    $cosine = -1 if $cosine < -1;
    return acos($cosine);
}

# vectorDistance($ref1, $ref2)
# Returns the Euclidean distance between the two vectors given as array
# references. The vectors may differ in length; missing or undefined
# components count as 0.
sub vectorDistance {
    my ($ref1, $ref2) = @_;
    my $last = $#{$ref1} > $#{$ref2} ? $#{$ref1} : $#{$ref2};
    my $sum = 0;
    foreach my $i (0 .. $last) {
        my $diff = ($ref1->[$i] || 0) - ($ref2->[$i] || 0);
        $sum += $diff**2;
    }
    return sqrt $sum;
}

# vectorMean(@refs)
# Returns, as a list, the component-wise mean of the vectors given as array
# references. Returns the empty list when called without vectors.
sub vectorMean {
    my @vecs    = grep { defined } @_;
    my $numvecs = scalar @vecs;
    my @means;
    foreach my $ref (@vecs) {
        foreach my $i (0 .. $#{$ref}) {
            $means[$i] += $ref->[$i] || 0;
        }
    }
    foreach my $i (0 .. $#means) {
        $means[$i] /= $numvecs;
    }
    return @means;
}

# classify($ref, @centroids)
# Returns the index within @centroids of the centroid (array reference)
# closest to the vector $ref by Euclidean distance; ties go to the lowest
# index. Returns nothing when no centroids are given.
sub classify {
    my ($ref, @centroids) = @_;
    return unless @centroids;

    my $closest     = 0;
    my $closestdist = vectorDistance($ref, $centroids[0]);
    foreach my $i (1 .. $#centroids) {
        my $dist = vectorDistance($ref, $centroids[$i]);
        if ($dist < $closestdist) {
            $closest     = $i;
            $closestdist = $dist;
        }
    }
    return $closest;    # the cluster index, not the distance to it
}