#! /usr/bin/perl

# http://www.xav.com/perl/site/lib/lwpcook.html

# NOTE(review): refer() overwrites this package global when it parses an
# Alertbox date line; no other reader is visible in this file -- confirm
# before removing.
$year = 2013;

use LWP::Simple;

# Fixed record lines that refer() prints verbatim (through add()) for
# Alertbox pages.
@alertbox = (
	# "%I useit.com",
	"%A Jakob Nielsen",
	"%I nngroup.com", # as of end of 2012
	"%S Alertbox: Web Usability Newsletter",
	"%K hci-sites:alertbox",
	);

# NOTE(review): @uie is not referenced by any sub visible in this file.
@uie = (
	"%I User Interface Engineering",
	"%K hci-sites:articles",
	);

# Fixed record lines that refer() prints for usability.gov URLs.
@usability = (
	"%I Usability.gov",
	"%K hci-sites:articles",
	);

# Entry point: the first command-line argument is the URL (or host/path) to
# describe.  Without it refer() would try to fetch the bare string "http://",
# so stop early with a usage message instead.
unless (defined $ARGV[0] && length $ARGV[0]) {
	die "usage: $0 URL\n";
}
&refer($ARGV[0]);

# Return a copy of the given text with every "&#39;" character reference
# replaced by a literal apostrophe.  No other entity is decoded.
sub cleanup {
	(my $text = shift) =~ s/&#39;/'/g;
	return $text;
}

# Fetch the page named by $file and print a record describing it to STDOUT,
# one "%<letter> value" line per field.
#
#   $file  URL of the page; "http://" is prepended when no http/https scheme
#          is present.
#
# Lines are printed in this order, each only when a value was found:
# %M (identifier), %0 INTERNET (always), the fixed site lines from @alertbox
# or @usability, %W (the URL, always), %T, %X, %K, %A, %D and %Y.
sub refer { # file
	my ($file) = @_;
	$file = "http://" . $file unless ($file =~ m|^https?://|i);

	# LWP::Simple's get() returns undef when the fetch fails; report that and
	# carry on with an empty document so the URL-derived fields still print.
	my $doc = get($file);
	unless (defined $doc) {
		warn "refer: could not fetch $file\n";
		$doc = "";
	}
	$doc = cleanup($doc);

	if (my $ident = ident($file)) {
		print "%M $ident\n";
	}
	print "%0 INTERNET\n";
	if ($file =~ /alertbox/) {
		add(@alertbox);
	} elsif ($file =~ /usability[.]gov/) {
		add(@usability);
	} elsif ($file =~ /nngroup[.]com\/articles/) {
		if ($doc =~ /Jakob Nielsen.{1,3}s Alertbox: (\w+) (\d+), (\d+)/) {
			# These stay package globals, as in the original script ($year
			# is also initialised at file level).
			($month, $day, $year) = ($1, $2, $3);
			$month = monthNum($month);
			# sprintf pads "5" to "05" and leaves an already padded "05"
			# alone (prefixing "0" would have produced "005").
			$day = sprintf("%02d", $day);
			print "%D $year-$month-$day\n";
			add(@alertbox);
		}
	}
	print "%W $file\n";
	if (my $title = title($doc)) {
		# Drop the site suffix that Alertbox pages append to their titles.
		$title =~ s/ \(Jakob Nielsen's Alertbox\)//;
		$title =~ s/ \(Alertbox[^)]*\)//;
		print "%T $title\n";
	}
	if (my $abstract = abstract($doc)) {
		print "%X $abstract\n";
	}
	# Prefer the Dublin Core keywords meta tag, fall back to the plain one.
	my $keywords = meta($doc, 'DC.keywords') || meta($doc, 'keywords');
	if ($keywords) {
		print "%K $keywords\n";
	}
	if (my $author = author($file, $doc)) {
		print "%A $author\n";
	}
	if (my $date = date($file, $doc)) {
		print "%D $date\n";
	}
	if (my $headings = headings($doc)) {
		# Strip markup nested inside the heading text before printing.
		$headings = notags($headings);
		print "%Y $headings\n";
	}
}

# Return a copy of the argument with leading and trailing whitespace removed
# and the first run of two or more spaces/tabs replaced by a single space
# (only the first run: the substitution is not global).
#
# NOTE(review): a second "sub trim" appears later in this file; Perl keeps the
# later definition, so this one is replaced at runtime.
sub trim {
	my $text = shift;
	for ($text) {
		s/^\s*//;  # leading space
		s/\s*$//;  # trailing space
		s/[ 	][ 	]+/ /m;  # embedded multispace
	}
	return $text;
}

# Return the trimmed content="..." value of the first <meta ...> tag in $doc
# whose name="..." attribute equals $name (case-insensitive), or "" when
# there is no such tag or the tag has no double-quoted content attribute.
#
#   $doc   HTML text to search
#   $name  meta name to look for, e.g. 'DC.title'; matched literally
sub meta {
	my ($doc, $name) = @_;
	return "" unless defined $doc && defined $name;
	# \Q...\E quotes regex metacharacters in $name, so the dots in names such
	# as "DC.date.created" match only a literal dot.
	if ($doc =~ m|(<meta [^>]*name="\Q$name\E"[^>]*>)|si) {
		my $metatag = $1;
		if ($metatag =~ m|content="([^"]*)"|si) {
			my $content = $1;
			return trim($content);
		}
	}
	return "";
}

# Return a copy of the argument with every "<...>" span removed; the text
# between tags is kept as is.
sub notags {
	(my $text = shift) =~ s/<[^>]*>//gs;
	return $text;
}

# Return a copy of the argument without leading or trailing whitespace.
# Whitespace inside the text is left untouched.
sub trim {
	my $text = shift;
	for ($text) {
		s/^\s+//gs;
		s/\s+$//gs;
	}
	return $text;
}

# Return the contents of every <h2> element in $doc, trimmed and joined with
# newlines, or "" when there is none.  Everything from the first
# "<h3>Share your thoughts" onwards is ignored.  Markup nested inside a
# heading is returned as is.
#
#   $doc  HTML text to scan
sub headings {
	my ($doc) = @_;
	return "" unless defined $doc;
	$doc =~ s/<h3>Share your thoughts.*$//is;
	my @headings;
	# Capture lazily up to the closing tag.  The earlier pattern "([^]*)" was
	# an unterminated character class and did not compile.
	while ($doc =~ m{<h2\b[^>]*>(.*?)</h2>}gis) {
		my $heading = $1;
		push @headings, trim($heading);
	}
	return join "\n", @headings;
}

# Return a short description of the page.  The DC.description meta tag is
# preferred, then the plain description meta tag.  When neither has a value
# the text is cut out of the page body: what follows the last
# <div class="article-info"> and the last remaining </div>, up to the first
# <h2> or "<h3>Share your thoughts", with tags stripped and the result trimmed.
sub abstract {
	my $html = shift;
	my $summary = meta($html, "DC.description") || meta($html, 'description');
	return $summary if $summary;

	for ($html) {
		s/.*<div class="article-info">//si;
		s/<h3>Share your thoughts.*$//is;
		s/<h2\b.*$//si;
		s/.*<\/div>//si;
		s/<\/?p>//gsi;
		# The first two blank lines become an indented line break; anything
		# from a third blank line onwards is dropped.
		s/\n\n/\n   /s;    # keep at most 2 paras
		s/\n\n/\n   /s;    # keep at most 2 paras
		s/\n\n.*$//s;      # keep at most 2 paras
	}
	return trim(notags($html));
}

# Work out the author of a page.
#
#   $file  URL of the page
#   $doc   HTML text of the page
#
# Returns, in order of preference: the text of a <p class="byline">By ...</p>
# element; "Jakob Nielsen" for URLs containing "alertbox"; "Jared M. Spool"
# for uie.com URLs; the tag-stripped text of a <div class="author-meta">by ...
# element; otherwise "".
sub author {
	my ($file, $doc) = @_;
	$doc = "" unless defined $doc;
	if ($doc =~ m|<p class="byline">By\s*([^<]*)</p>|im) {
		return $1;
	}
	if ($file =~ /alertbox/i) {
		return "Jakob Nielsen";
	}
	if ($file =~ /uie[.]com/i) {
		return "Jared M. Spool";
	}
	# Capture lazily up to the div's closing tag.  The earlier pattern
	# "([^]*)" was an unterminated character class and did not compile.
	if ($doc =~ m|<div class="author-meta">by(.*?)</div>|s) {
		my $author = $1;
		$author = notags($author);
		$author = trim($author);
		# Turns "NAME on DATE" into "NAME\n%D DATE" so the caller's "%A" line
		# is followed by a "%D" line.
		# NOTE(review): this also matches "on" inside a word (e.g. a name
		# containing "on"); left unchanged because the real markup is not
		# visible here.
		$author =~ s/\s*on\s*/\n%D /;
		return $author;
	}
	return "";
}

# Return the page title, trimmed: the DC.title meta tag when it has a value,
# otherwise the text of the <title> element, otherwise "".
sub title {
	my $html = shift;
	my $dc_title = meta($html, "DC.title");
	return trim($dc_title) if $dc_title;
	return trim($1) if $html =~ m|<title>([^<]*)</title>|i;
	return "";
}

# Build the record identifier for a URL.
#
#   $file  URL of the page
#
# Returns "U.Nielsen.<name>" for URLs containing "alertbox" (last path
# component without its .htm/.html suffix), "U.<name>.uie.com" or
# "U.<name>.nngroup.com" for those sites (last path component, trailing slash
# ignored), and otherwise the URL with everything up to "//" removed.
sub ident {
	my ($file) = @_;
	if ($file =~ /alertbox/) {
		$file =~ s|.*/||;       # remove everything up to /
		# The dot is escaped so only a real ".htm" suffix is cut; unescaped
		# it also swallowed names such as "xhtml-basics".
		$file =~ s|\.htm.*||;   # remove file suffix
		return "U.Nielsen.$file";
	} elsif ($file =~ /uie[.]com/) {
		$file =~ s|/$||;
		$file =~ s|.*/||;
		return "U.$file.uie.com";
	} elsif ($file =~ /nngroup[.]com/) {
		# %W http://www.nngroup.com/reports/agile/
		$file =~ s|/$||;
		$file =~ s|.*/||;
		return "U.$file.nngroup.com";
	}
	$file =~ s|.*//||;
	return $file;
}

# Work out the publication date of a page.
#
#   $file  URL of the page
#   $doc   HTML text of the page
#
# Sources, in order of preference:
#   1. the DC.date.created meta tag, cut at the first space;
#   2. eight digits in the last URL component, read as YYYYMMDD;
#   3. six digits there, read as YYMMDD in the 1900s;
#   4. the text after "originally published:" in the page.
# Returns "" when none applies.
sub date {
	my ($file, $doc) = @_;
	my $date = meta($doc, 'DC.date.created');
	if ($date) {
		$date =~ s/ .*$//; # trim off time
		return $date;
	}
	$file =~ s|.*/||;
	$file =~ s|\.htm.*||;   # literal dot: strip only a real .htm/.html suffix
	if ($file =~ /(\d\d\d\d)(\d\d)(\d\d)/) {
		return "$1-$2-$3";
	} elsif ($file =~ /(\d\d)(\d\d)(\d\d)/) {
		return "19$1-$2-$3";
	} elsif ($doc =~ /originally published:\s*([^<]*)</i) {
		$date = $1;
		# Each piece is removed by the same substitution that captures it,
		# rather than by an empty-pattern s/// that silently reuses the last
		# successful regex.
		if ($date =~ s/\b(\d\d\d\d)\b//) {
			my $year = $1;
			$date =~ s/,\s*//;
			if ($date =~ s/([a-z][a-z]+)//i) {
				my $name = $1;
				my $month = monthNum($name);
				$date =~ s/\s//g;
				# What is left is the day; pad it to two digits so the result
				# is YYYY-MM-DD.
				$date = sprintf("%02d", $date) if $date =~ /^\d+$/;
				$date = "$year-$month-$date";
			} else {
				$date =~ s/ /-/;
				$date = "$year-$date";
			}
		}
		return $date;
	}
	return "";
}

# Print each argument to STDOUT on a line of its own.
sub add {
	print "$_\n" for @_;
}

# Convert a month name to its two-digit number ("01".."12").
# The first three letters of each month are looked for case-insensitively
# anywhere in the string, January first; a match is replaced, together with
# everything after it, by the number.  Text that names no month is returned
# unchanged.
sub monthNum {
	my $text = shift;
	my $number = 0;
	for my $abbr (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)) {
		my $digits = sprintf("%02d", ++$number);
		$text =~ s/${abbr}.*$/$digits/i;
	}
	return $text;
}

