#!/usr/bin/perl use strict; use LWP::UserAgent; use HTTP::Request::Common qw(POST); use LogLine; use Getopt::Long; my $outdir = 'bylsa'; my $iterlimit = 9999999; GetOptions("outdir=s", \$outdir, "iterlimit=i", \$iterlimit); foreach my $file (@ARGV) { $file =~ m#/?([^/]*)$#; my $outlog = "$outdir/bylsa-$1"; print "writing to $outlog\n"; #open WRITE, ">$outlog" or (warn "Can't open $outlog: $!" and next); #my @lines = &LogLine::stringlistnonick(&LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file)))) or warn "No data for $file: $!"; my @lines = &LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file))) or warn "No data for $file: $!"; my @lsastrings; my @words; my @chunks; my $curchunktxt; my $numwords; #foreach my $line (@lines) { #my @wordsinline = split /\s+/, LogLine::stringnonick($line); #$curchunktxt .= $line->{raw}; #$numwords += @wordsinline; #push @words, @wordsinline; #if ($numwords >= 75) { #push @lsastrings, join ' ', @words; #$numwords = 0; #@words = [ ]; #push @chunks, $curchunktxt; #$curchunktxt = ''; #} #} #for (my $i = 0; $i < @lsastrings; ++$i) { #$lsastrings[$i] .= $lsastrings[$i+1] . $lsastrings[$i+2] . $lsastrings[$i+3]; #} my %nicklines; foreach my $line (@lines) { push @{$nicklines{$line->{nick}}}, $line->{text}; } foreach my $nick (sort keys %nicklines) { push @lsastrings, join "\n", @{$nicklines{$nick}}; } #my @words = split(/[\s\n]+/, join(" ", @lines)); #for (my $i = 0; $i < @words; $i += 75) { #push @lsastrings, join(" ", @words[$i..($i+300)]); #print $i*75..$i*75+300; #print $lsastrings[$#lsastrings] . "\n~~~~~~~~\n"; #} #for (my $i = 0; $i < @lsastrings -1 && $i < $iterlimit; $i++) { #print "$lsastrings[$i] ### $lsastrings[$i+1]\n"; #my @matrix = doLSA($lsastrings[$i], $lsastrings[$i+1]); #my @lead = split /\s+/, $lsastrings[$i+1]; #print WRITE "#similarity $matrix[0][1]\n" . join(' ', @lead[0..75]) . "\n"; #print WRITE $chunks[$i] . "#similarity $matrix[0][1]\n"; #foreach (@lsastrings) { #print; #print "\n#################\n"; #} my @matrix = doLSA(@lsastrings); print join "\t", sort keys %nicklines; print "\n"; foreach my $row (@matrix) { foreach my $col (@$row) { print $col . "\t"; } print "\n"; } print "********\n"; #} #print WRITE $chunks[$#chunks]; #close WRITE; print "Completed $file\n"; } sub doLSA { # pass "documents" (chunks of text) as args #LSA options my $semanticspace = "General_Reading_up_to_1st_year_college (300 factors)"; my $comparisontype = "doc2doc"; #my $teststring = "This is a test.\nDo you like the test?\r\n\r\nI love baby cheesy.\n"; my $submitstring = join "\r\n\r\n", @_; $submitstring =~ s/#//g; # lsa website seems to secretly use # to make new texts my $ua = new LWP::UserAgent; my $req = POST('http://128.138.223.70/cgi-bin/LSA-matrix-x.html', [ "LSAspace" => $semanticspace, "CmpType" => $comparisontype, "txt1" => $submitstring ]); #$req->content_type('application/x-www-form-urlencoded'); #$req->content("$semanticspace&$comparisontype&$teststring"); my $res; do { $res = $ua->request($req); if ($res->is_success) { #print $res->content; my $content = $res->content; $content =~ s/^.*\n\s*\n//s; $content =~ s#
\s*\n\s*\n

.*##s; my @rows = split /^\s*/m, $content; my @matrix; my @texts; my $linenum = 0; foreach (@rows) { chomp; my @fields = split /\s*
\s*/; shift @fields if $fields[0] eq ''; next unless @fields; if ($fields[0] eq "Document") { shift @fields; @texts = @fields; next; } else { shift @fields; for (my $i = 0; $i < @fields; $i++) { $matrix[$linenum][$i] = $fields[$i]; } $linenum++; } } #print "$matrix[0][0]\n$matrix[0][1]\n$matrix[1][0]\n$matrix[1][1]\n"; return @matrix; } else { print STDERR "Error connecting to LSA website: " . $res->status_line . ".\n"; #return undef; } } while (!$res->is_success && sleep 2); }