#!/usr/bin/perl

use HTML::Entities;


use strict;
use warnings;

# DOCTYPE printed at the very top of the output.  Any DOCTYPE already present
# in the input is removed by clean_html(), so the result carries exactly one.
my $doctype = "<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>\n";

# clean_html($html)
#
# Tidy up a single-string HTML document (presumably a "west"-exported page,
# going by the original comment -- confirm): strip presentational markup
# (span/font/style/meta tags, attributes on block elements), drop empty
# paragraphs, remove any existing DOCTYPE / XML declaration, add the XHTML
# namespace to a bare <html> tag and insert newlines after structural tags so
# the output is readable.
#
# Parameters:
#   $html - the document as one string (the driver below passes it with all
#           line breaks already removed).
# Returns:
#   the cleaned-up string; '' when $html is undefined.
#
# The substitutions are order-dependent (e.g. attributes are stripped from <p>
# before "<p>&nbsp;</p>" is looked for), so keep the sequence when editing.
sub clean_html {
    my ($html) = @_;
    return '' unless defined $html;

    # Drop ".htm" extensions.  The lookahead leaves ".html" alone; without it
    # "page.html" would be mangled into "pagel".
    $html =~ s/\.htm(?!l)//g;

    # Presentational spans.
    $html =~ s/<span class=font\d>//g;
    $html =~ s{</span>}{}g;
    # disabled: $html =~ s/<(SPAN|IMG|td|tr|table)[^>]+>//g;

    # Embedded stylesheets.  Non-greedy so that two <style> blocks do not
    # swallow the real content sitting between them; /s lets a block span
    # line breaks if the caller did not remove them.
    $html =~ s{<style\stype="text/css">.*?</style>}{}gs;
    $html =~ s/<meta\b[^>]+>//g;
    $html =~ s{<p></p>}{}g;

    # Footnote handling (disabled).
    # $html =~ s/\[FN(\d+)\]/<sup>$1<\/sup>/g;
    # $html =~ s/FN(\d+)\./<sup>$1<\/sup>/g;
    # $html =~ s/<a\b[^>]+>//g;
    # $html =~ s/<\/a>//g;

    # Anchors for the original page breaks (disabled).
    # $html =~ s/<b>\*(\d\d?)<\/b>/<a name="pagebreak#$1"><\/a>/g;

    # Strip every attribute from paragraph, div and heading tags.
    $html =~ s{<(p|div|h2|h3|h1)\b[^>]+>}{<$1>}g;

    # Line breaks after block-level and structural tags.
    $html =~ s{<div>}{<div>\n}g;
    $html =~ s{</div>}{</div>\n}g;
    $html =~ s{<p>&nbsp;</p>}{}g;
    $html =~ s{</p>}{</p>\n\n}g;
    $html =~ s{<body>}{<body>\n}g;
    $html =~ s{</body>}{</body>\n}g;
    $html =~ s{</head>}{</head>\n}g;
    $html =~ s{<head>}{<head>\n}g;
    $html =~ s{</title>}{</title>\n}g;

    # <html> tag: one that already has attributes is only put on its own
    # line; a bare one additionally receives the XHTML namespace.  The
    # attribute form runs first so it cannot re-match the tag produced by the
    # bare form (which would stack up extra blank lines).
    $html =~ s{<html(\b[^>]+)>}{\n<html$1>\n}g;
    $html =~ s{<html>}{\n<html xmlns="http://www.w3.org/1999/xhtml">\n}g;

    # Left-overs from the source document's own prologue and boilerplate.
    $html =~ s{dtd">>}{dtd">\n}g;
    $html =~ s{Untitled\sDocument}{}g;
    $html =~ s{<!DOCTYPE\b[^>]+>}{}g;
    $html =~ s{<\?xml\b[^>]+>}{}g;

    # <font> wrappers.
    $html =~ s{<font\b[^>]+>}{}g;
    $html =~ s{</font>}{}g;

    # Line breaks after headings.
    $html =~ s{</h2>}{</h2>\n}g;
    $html =~ s{</h3>}{</h3>\n}g;

    return $html;
}

# Driver: runs only when the file is executed as a script, so the file can
# also be loaded (e.g. by a test) without consuming any input.
unless (caller) {
    # Join the whole input (files named on the command line, else STDIN) into
    # one line so the patterns above can match across original line breaks.
    # NOTE(review): lines are joined with no separator, so text on either side
    # of a line break ends up glued together -- confirm this is intended.
    my $filestring = '';
    while (my $line = <>) {
        chomp $line;
        $filestring .= $line;
    }

    print $doctype;
    print clean_html($filestring);
}

1;
