#!/usr/bin/perl

use strict;
use LWP::UserAgent;
use HTTP::Request::Common qw(POST);
use LogLine;
use Getopt::Long;

my $outdir = 'bylsa';
my $iterlimit = 9999999;
GetOptions("outdir=s", \$outdir, "iterlimit=i", \$iterlimit);

foreach my $file (@ARGV) {
	$file =~ m#/?([^/]*)$#;
	my $outlog = "$outdir/bylsa-$1";
	print "writing to $outlog\n";
	open WRITE, ">$outlog" or (warn "Can't open $outlog: $!" and next);
	#my @lines = &LogLine::stringlistnonick(&LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file)))) or warn "No data for $file: $!";
	my @lines = &LogLine::filterMarkup(&LogLine::filterServerMessages(LogLine::readFile($file))) or warn "No data for $file: $!";
	my @lsastrings;
	my @words;
	my @chunks;
	my $curchunktxt;
	my $numwords;
	foreach my $line (@lines) {
		my @wordsinline = split /\s+/, LogLine::stringnonick($line);
		$curchunktxt .= $line->{raw};
		$numwords += @wordsinline;
		push @words, @wordsinline;
		if ($numwords >= 75) {
			push @lsastrings, join ' ', @words;
			$numwords = 0;
			@words = [ ];
			push @chunks, $curchunktxt;
			$curchunktxt = '';
		}
	}
	for (my $i = 0; $i < @lsastrings; ++$i) {
		$lsastrings[$i] .= $lsastrings[$i+1] . $lsastrings[$i+2] . $lsastrings[$i+3];
	}

	#my @words = split(/[\s\n]+/, join(" ", @lines));
	#for (my $i = 0; $i < @words; $i += 75) {
		#push @lsastrings, join(" ", @words[$i..($i+300)]);
		#print $i*75..$i*75+300;
		#print $lsastrings[$#lsastrings] . "\n~~~~~~~~\n";
	#}
	for (my $i = 0; $i < @lsastrings -1 && $i < $iterlimit; $i++) {
		#print "$lsastrings[$i] ### $lsastrings[$i+1]\n";
		my @matrix = doLSA($lsastrings[$i], $lsastrings[$i+1]);
		#my @lead = split /\s+/, $lsastrings[$i+1];
		#print WRITE "#similarity $matrix[0][1]\n" . join(' ', @lead[0..75]) . "\n";
		print WRITE $chunks[$i] . "#similarity $matrix[0][1]\n";
		foreach my $row (@matrix) {
			foreach my $col (@$row) {
				print $col . "\t";
			}
			print "\n";
		}
		print "********\n";
	}
	print WRITE $chunks[$#chunks];
	close WRITE;
	print "Completed $file\n";
}

sub doLSA {  # pass "documents" (chunks of text) as args
#LSA options
my $semanticspace = "General_Reading_up_to_1st_year_college 	 (300 factors)";
my $comparisontype = "doc2doc";
#my $teststring = "This is a test.\nDo you like the test?\r\n\r\nI love baby cheesy.\n";
my $submitstring = join "\r\n\r\n", @_;
$submitstring =~ s/#//g;  # lsa website seems to secretly use # to make new texts

my $ua = new LWP::UserAgent;
my $req = POST('http://128.138.223.70/cgi-bin/LSA-matrix-x.html', [ "LSAspace" => $semanticspace, "CmpType" => $comparisontype, "txt1" => $submitstring ]);
#$req->content_type('application/x-www-form-urlencoded');
#$req->content("$semanticspace&$comparisontype&$teststring");

my $res;
do {
	$res = $ua->request($req);

if ($res->is_success) {
	#print $res->content;
	my $content = $res->content;
	$content =~ s/^.*\n<TABLE BORDER>\s*\n//s;
	$content =~ s#</TABLE>\s*\n\s*\n<P><TABLE NOBORDER>.*##s;
	my @rows = split /^<TR>\s*/m, $content;
	my @matrix;
	my @texts;
	my $linenum = 0;
	foreach (@rows) {
		chomp;
		my @fields = split /\s*<TD ALIGN=CENTER>\s*/;
		shift @fields if $fields[0] eq '';
		next unless @fields;
		if ($fields[0] eq "Document") {
			shift @fields;
			@texts = @fields;
			next;
		}
		else {
			shift @fields;
			for (my $i = 0; $i < @fields; $i++) {
				$matrix[$linenum][$i] = $fields[$i];
			}
			$linenum++;
		}
	}
	#print "$matrix[0][0]\n$matrix[0][1]\n$matrix[1][0]\n$matrix[1][1]\n";
	return @matrix;
} else {
	print STDERR "Error connecting to LSA website: " . $res->status_line . ".\n";
	#return undef;
}
} while (!$res->is_success && sleep 2);
}
