#!/usr/bin/perl
#
# Cluster the nicks found in chat logs by the words they use.
#
# For every log file named on the command line the script
#   1. reads the file through LogLine and filters server messages and markup,
#   2. builds, per nick, a vector of relative word frequencies over the
#      vocabulary of the whole file,
#   3. groups the nicks into --numconv clusters with a k-means-style loop
#      (random initial assignment, then --numiter rounds of "recompute the
#      centroids / reassign every nick to the nearest centroid"),
#   4. prints the cluster each nick ended up in.
#
# Options:
#   --outdir DIR   directory used to build the announced output name
#                  (default: bytrevseg)
#   --numconv N    number of clusters, must be >= 1 (default: 2)
#   --numiter N    number of refinement rounds (default: 10)

use strict;
use warnings;

use LogLine;
use Getopt::Long;
use Math::Trig;

my $outdir           = 'bytrevseg';
my $numconversations = 2;
my $numiter          = 10;

GetOptions(
    "outdir=s",  \$outdir,
    "numconv=i", \$numconversations,
    "numiter=i", \$numiter,
) or die "Usage: $0 [--outdir DIR] [--numconv N] [--numiter N] logfile...\n";

# With fewer than one cluster there is nothing to assign a nick to.
die "--numconv must be at least 1\n" if $numconversations < 1;

foreach my $file (@ARGV) {
    # The pattern can always match (possibly the empty string), so
    # $basename is always defined.
    my ($basename) = $file =~ m#/?([^/]*)$#;
    my $outlog = "$outdir/bytrevseg-$basename";
    print "writing to $outlog\n";

    # NOTE: the output file is only announced. Nothing opens or writes
    # $outlog yet; results go to STDOUT below.

    # The leading '&' is kept on purpose: LogLine is not visible from this
    # file, and dropping it would change parsing if these subs have
    # prototypes.
    my @lines = &LogLine::filterMarkup(
        &LogLine::filterServerMessages(LogLine::readFile($file)));
    unless (@lines) {
        warn "No data for $file: $!";
        next;
    }

    # Word counts over the whole file, per nick, and total words per nick.
    my %globalhistogram;
    my %nickhistogram;
    my %numwords;
    foreach my $line (@lines) {
        my $text = $line->{text};
        next unless defined $text;
        my $nick = defined $line->{nick} ? $line->{nick} : '';

        foreach my $word (split /\s+/, $text) {
            # Strip punctuation around the word only; characters inside
            # the word (e.g. the apostrophe in "don't") are kept.
            $word =~ s/^\W+//;
            $word =~ s/\W+$//;
            $word = lc $word;
            next if $word eq '';

            $globalhistogram{$word}++;
            $nickhistogram{$nick}{$word}++;
            $numwords{$nick}++;
        }
    }

    # One relative-frequency vector per nick. Every vector uses the same
    # sorted vocabulary, so index i means the same word for all nicks.
    my @vocabulary = sort keys %globalhistogram;
    my %nickvector;
    foreach my $nick (sort keys %nickhistogram) {
        my $counts = $nickhistogram{$nick};
        $nickvector{$nick} =
            [ map { ($counts->{$_} || 0) / $numwords{$nick} } @vocabulary ];
    }

    my @clusters;          # centroid (array ref) per cluster index
    my @clustervectors;    # member vectors (array ref) per cluster index
    my %whichcluster;      # nick => cluster index

    # Random initial assignment.
    foreach my $nick (keys %nickvector) {
        my $cluster = int rand $numconversations;
        push @{ $clustervectors[$cluster] }, $nickvector{$nick};
        $whichcluster{$nick} = $cluster;
    }

    foreach my $iteration (1 .. $numiter) {
        # Rebalance: each centroid becomes the mean of its members. A
        # cluster without members gets an empty centroid, which
        # vectorDistance() treats as the origin.
        foreach my $cluster (0 .. $numconversations - 1) {
            my $members = $clustervectors[$cluster] || [];
            $clusters[$cluster] = [ vectorMean(@$members) ];
        }

        # Reassign every nick to its nearest centroid.
        @clustervectors = ();
        foreach my $nick (keys %nickvector) {
            my $cluster = classify($nickvector{$nick}, @clusters);
            push @{ $clustervectors[$cluster] }, $nickvector{$nick};
            $whichcluster{$nick} = $cluster;
        }
    }

    # Sorted so that the report order is stable between runs.
    foreach my $nick (sort keys %whichcluster) {
        print "$nick is in cluster $whichcluster{$nick}\n";
    }
}

# vectorAngle(\@a, \@b)
#
# Returns the angle, in radians (0 .. pi), between the two vectors given
# as array references. Missing or undefined components count as 0.
# Returns undef when either vector has zero length, because the angle is
# not defined then.
sub vectorAngle {
    my ($ref1, $ref2) = @_;

    my $last = $#{$ref1} > $#{$ref2} ? $#{$ref1} : $#{$ref2};
    my ($dot, $sqnorm1, $sqnorm2) = (0, 0, 0);
    foreach my $i (0 .. $last) {
        my $x = $ref1->[$i] || 0;
        my $y = $ref2->[$i] || 0;
        $dot     += $x * $y;
        $sqnorm1 += $x * $x;
        $sqnorm2 += $y * $y;
    }
    return if $sqnorm1 == 0 || $sqnorm2 == 0;

    # Clamp: rounding can push the ratio slightly outside [-1, 1], where
    # Math::Trig's acos would return a complex number.
    my $cosine = $dot / sqrt($sqnorm1 * $sqnorm2);
    $cosine = 1  if $cosine > 1;
    $cosine = -1 if $cosine < -1;

    return acos($cosine);
}

# vectorDistance(\@a, \@b)
#
# Returns the Euclidean distance between the two vectors, measured over
# the indices of the first one. Components missing from the second vector
# (including a wholly empty or undefined second vector) count as 0.
sub vectorDistance {
    my ($ref1, $ref2) = @_;
    my $other = $ref2 || [];

    my $sum = 0;
    foreach my $i (0 .. $#{$ref1}) {
        my $x = $ref1->[$i]  || 0;
        my $y = $other->[$i] || 0;
        $sum += ($x - $y)**2;
    }
    return sqrt $sum;
}

# vectorMean(\@v1, \@v2, ...)
#
# Returns the component-wise mean of the given vectors as a list.
# Returns the empty list when called without vectors.
sub vectorMean {
    my @vectors = @_;
    my $numvecs = scalar @vectors;

    my @means;
    foreach my $vec (@vectors) {
        foreach my $i (0 .. $#{$vec}) {
            $means[$i] += $vec->[$i] || 0;
        }
    }
    # @means is empty when there were no vectors, so no division by zero.
    foreach my $mean (@means) {
        $mean = ($mean || 0) / $numvecs;
    }
    return @means;
}

# classify(\@vector, \@centroid0, \@centroid1, ...)
#
# Returns the index (0-based, relative to the centroid arguments) of the
# centroid closest to the vector. On a tie the lowest index wins.
sub classify {
    my ($ref, @centroids) = @_;

    my $closest     = 0;
    my $closestdist = vectorDistance($ref, $centroids[0]);
    foreach my $i (1 .. $#centroids) {
        my $dist = vectorDistance($ref, $centroids[$i]);
        if ($dist < $closestdist) {
            $closest     = $i;
            $closestdist = $dist;
        }
    }
    return $closest;
}