Fixes some common XHTML validation errors, to make a web page pass the W3C markup validator. Does this still work? I have no idea, I never build web pages by hand anymore.
#!/usr/bin/perl -cw
#
# Clean up a few common xhtml validation errors
#
# Arguments: provide any number of filenames on the command line
# Each file will be cleaned, with original saved as file.bak[.num]
{
# Parser class to replace font tags with suitable values
package CleanupParser;
use base 'HTML::Parser';
# presentational attributes to remove from all tags
# exception: <img> gets to keep width and height
my @STRIP_ATTR = ('border', 'cellpadding', 'cellspacing',
'width', 'height', 'align', 'valign', 'hspace', 'vspace',
'target', 'onmouse', 'onmouseover', 'onclick', 'nowrap'
);
# presentation tags to remove
my @STRIP_TAGS = (
'center',
);
# Font substitution table
my %fontsub = (
#font size => replacement tag
'1' => '',
'2' => '', # 2 is normal size, just delete
'3' => 'h4',
'4' => 'h3',
'5' => 'h2',
'6' => 'h1',
'+1' => 'h3',
'+2' => 'h2',
'+3' => 'h1',
'+4' => 'h1',
'+5' => 'h1',
'+6' => 'h1',
'-1' => '',
'-2' => '',
'-3' => '',
'-4' => '',
'-5' => '',
'-6' => '',
);
# Store font closing tags on this stack
my @fontstack = ();
# Store processed document in this string
my $result = '';
# Form the correct closing tag from an open tag w/ attributes
sub close_tag {
my ($tag) = @_;
return '/' . (split (/\s+/, $tag))[0];
}
# Create a text representation of a tag
# Uses attrseq for attributes; values in attr but not attrseq won't
# be copied. This allows stripping attributes while still saving
# their value for later processing.
sub make_text {
my ($self, $tagname, $attr, $attrseq) = @_;
my $text = "<$tagname";
# The / on /> closing marker for strict xhtml tags is not an
# attribute but parser catches it as one. We remedy that here.
foreach my $key (grep { $_ ne '/' } @$attrseq) {
my $val = $attr->{$key};
$text .= " $key=\"$val\"";
}
$text .= exists $attr->{'/'} ? ' />' : '>';
#print " $text\n";
}
# Process start tags. First strip any attributes found
# in STRIP_ATTR out of $attrseq (but leave $attr untouched
# so original values still accessible). Next, if a method named
# start_TAG exists, call it. Otherwise, just copy the tag to the
# result.
sub start {
my ($self, $tagname, $attr, $attrseq, $text) = @_;
# strip out presentation attributes
foreach my $key (@STRIP_ATTR) {
if (exists $attr->{$key}) {
# strip from attrseq but leave in attr
@$attrseq = grep { $_ ne $key } @$attrseq;
}
}
$text = $self->make_text ($tagname, $attr, $attrseq);
my $meth = $self->can ("start_" . $tagname);
if ($meth) {
$self->$meth ($attr, $attrseq, $text);
}
elsif (grep { $_ eq $tagname } @STRIP_TAGS) {
return;
}
else {
$result .= $text;
}
}
# start tag function template
sub start_X {
my ($self, $attr, $attrseq, $text) = @_;
}
# Change bold to strong
sub start_b {
my ($self, $attr, $attrseq, $text) = @_;
$result .= '<strong>';
}
sub end_b {
my ($self, $tagname, $text) = @_;
$result .= '</strong>';
}
# Change italics to em
sub start_i {
my ($self, $attr, $attrseq, $text) = @_;
$result .= '<em>';
}
sub end_i {
my ($self, $tagname, $text) = @_;
$result .= '</em>';
}
# Restore height/width attributes to images
sub start_img {
my ($self, $attr, $attrseq, $text) = @_;
if (exists $attr->{'width'}) { push (@$attrseq, 'width'); }
if (exists $attr->{'height'}) { push (@$attrseq, 'height'); }
$text = $self->make_text ('img', $attr, $attrseq);
$result .= $text;
}
# Process <font>
sub start_font {
my ($self, $attr, $attrseq, $text) = @_;
my $size = ($attr->{'size'} or '');
my $replacement = $fontsub{$size};
if ($replacement) {
#print "size = $size, replace with $replacement\n";
push (@fontstack, close_tag ($replacement));
# write replacement tag
$result .= "<$replacement>";
} else {
push (@fontstack, '');
#print "no size attr\n";
}
}
# Process end tags. If a method named end_TAG exists, call it.
# Otherwise, just copy the tag to the result.
sub end {
my ($self, $tagname, $text) = @_;
my $meth = $self->can ("end_" . $tagname);
if ($meth) {
$self->$meth ($text);
}
else {
$result .= $text;
}
}
# end tag function template
sub end_X {
my ($self, $tagname, $text) = @_;
}
# Process </font>
sub end_font {
my ($self, $text) = @_;
my $replacement = pop (@fontstack);
#if ($replacement) { print "<$replacement>"; }
if ($replacement) { $result .= "<$replacement>"; }
}
# Everything else gets spit out verbatim
sub text {
my ($self, $text) = @_;
#print $text;
$result .= $text;
}
# Return the processed document and clear the storage buffer
# Must be called before parsing new doc
sub finish {
my $doc = $result;
$result = '';
return $doc;
}
# # Use same handler as text for these types
# *comment = \&text;
# *declaration = \&text;
# *process = \&text;
}; # end CleanupParser
# ------------- Main --------------------
foreach my $file (@ARGV) {
my $tmpfile = "$file.tmp";
open (OLD, "<$file") or die "open failed ($file): $!";
open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!";
@lines = <OLD>;
$content = join ('', @lines);
my $parser = CleanupParser->new(
#api_version => 3,
#default_h => [sub { print shift }, 'text'],
default_h => ['text', 'self,text'],
start_h => ['start', 'self,tagname,attr,attrseq,text'],
end_h => ['end', 'self,tagname,text'],
);
#$parser->xml_mode (1);
#$parser->parse_file ($file);
$parser->parse ($content);
$parser->eof;
$content = $parser->finish;
print NEW $content;
close (OLD) or die "close failed ($file): $!";
close (NEW) or die "close failed ($tmpfile): $!";
my $bakfile = "$file.cln.bak";
my $cnt = 0;
while (-e $bakfile) { $bakfile = "$file.$cnt.cln.bak"; $cnt++; }
#warn "WARNING: renaming disabled\n";
rename ($file, $bakfile) or die "move failed ($file -> $bakfile): $!";
rename ($tmpfile, $file) or die "move failed ($tmpfile -> $file): $!";
print "saved $file as $bakfile\n"
}
![]()