Fixes some common XHTML validation errors, to make a web page pass the W3C markup validator. Does this still work? I have no idea, I never build web pages by hand anymore.
#!/usr/bin/perl -cw # # Clean up a few common xhtml validation errors # # Arguments: provide any number of filenames on the command line # Each file will be cleaned, with original saved as file.bak[.num] { # Parser class to replace font tags with suitable values package CleanupParser; use base 'HTML::Parser'; # presentational attributes to remove from all tags # exception: <img> gets to keep width and height my @STRIP_ATTR = ('border', 'cellpadding', 'cellspacing', 'width', 'height', 'align', 'valign', 'hspace', 'vspace', 'target', 'onmouse', 'onmouseover', 'onclick', 'nowrap' ); # presentation tags to remove my @STRIP_TAGS = ( 'center', ); # Font substitution table my %fontsub = ( #font size => replacement tag '1' => '', '2' => '', # 2 is normal size, just delete '3' => 'h4', '4' => 'h3', '5' => 'h2', '6' => 'h1', '+1' => 'h3', '+2' => 'h2', '+3' => 'h1', '+4' => 'h1', '+5' => 'h1', '+6' => 'h1', '-1' => '', '-2' => '', '-3' => '', '-4' => '', '-5' => '', '-6' => '', ); # Store font closing tags on this stack my @fontstack = (); # Store processed document in this string my $result = ''; # Form the correct closing tag from an open tag w/ attributes sub close_tag { my ($tag) = @_; return '/' . (split (/\s+/, $tag))[0]; } # Create a text representation of a tag # Uses attrseq for attributes; values in attr but not attrseq won't # be copied. This allows stripping attributes while still saving # their value for later processing. sub make_text { my ($self, $tagname, $attr, $attrseq) = @_; my $text = "<$tagname"; # The / on /> closing marker for strict xhtml tags is not an # attribute but parser catches it as one. We remedy that here. foreach my $key (grep { $_ ne '/' } @$attrseq) { my $val = $attr->{$key}; $text .= " $key=\"$val\""; } $text .= exists $attr->{'/'} ? ' />' : '>'; #print " $text\n"; } # Process start tags. First strip any attributes found # in STRIP_ATTR out of $attrseq (but leave $attr untouched # so original values still accessible). Next, if a method named # start_TAG exists, call it. Otherwise, just copy the tag to the # result. sub start { my ($self, $tagname, $attr, $attrseq, $text) = @_; # strip out presentation attributes foreach my $key (@STRIP_ATTR) { if (exists $attr->{$key}) { # strip from attrseq but leave in attr @$attrseq = grep { $_ ne $key } @$attrseq; } } $text = $self->make_text ($tagname, $attr, $attrseq); my $meth = $self->can ("start_" . $tagname); if ($meth) { $self->$meth ($attr, $attrseq, $text); } elsif (grep { $_ eq $tagname } @STRIP_TAGS) { return; } else { $result .= $text; } } # start tag function template sub start_X { my ($self, $attr, $attrseq, $text) = @_; } # Change bold to strong sub start_b { my ($self, $attr, $attrseq, $text) = @_; $result .= '<strong>'; } sub end_b { my ($self, $tagname, $text) = @_; $result .= '</strong>'; } # Change italics to em sub start_i { my ($self, $attr, $attrseq, $text) = @_; $result .= '<em>'; } sub end_i { my ($self, $tagname, $text) = @_; $result .= '</em>'; } # Restore height/width attributes to images sub start_img { my ($self, $attr, $attrseq, $text) = @_; if (exists $attr->{'width'}) { push (@$attrseq, 'width'); } if (exists $attr->{'height'}) { push (@$attrseq, 'height'); } $text = $self->make_text ('img', $attr, $attrseq); $result .= $text; } # Process <font> sub start_font { my ($self, $attr, $attrseq, $text) = @_; my $size = ($attr->{'size'} or ''); my $replacement = $fontsub{$size}; if ($replacement) { #print "size = $size, replace with $replacement\n"; push (@fontstack, close_tag ($replacement)); # write replacement tag $result .= "<$replacement>"; } else { push (@fontstack, ''); #print "no size attr\n"; } } # Process end tags. If a method named end_TAG exists, call it. # Otherwise, just copy the tag to the result. sub end { my ($self, $tagname, $text) = @_; my $meth = $self->can ("end_" . $tagname); if ($meth) { $self->$meth ($text); } else { $result .= $text; } } # end tag function template sub end_X { my ($self, $tagname, $text) = @_; } # Process </font> sub end_font { my ($self, $text) = @_; my $replacement = pop (@fontstack); #if ($replacement) { print "<$replacement>"; } if ($replacement) { $result .= "<$replacement>"; } } # Everything else gets spit out verbatim sub text { my ($self, $text) = @_; #print $text; $result .= $text; } # Return the processed document and clear the storage buffer # Must be called before parsing new doc sub finish { my $doc = $result; $result = ''; return $doc; } # # Use same handler as text for these types # *comment = \&text; # *declaration = \&text; # *process = \&text; }; # end CleanupParser # ------------- Main -------------------- foreach my $file (@ARGV) { my $tmpfile = "$file.tmp"; open (OLD, "<$file") or die "open failed ($file): $!"; open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!"; @lines = <OLD>; $content = join ('', @lines); my $parser = CleanupParser->new( #api_version => 3, #default_h => [sub { print shift }, 'text'], default_h => ['text', 'self,text'], start_h => ['start', 'self,tagname,attr,attrseq,text'], end_h => ['end', 'self,tagname,text'], ); #$parser->xml_mode (1); #$parser->parse_file ($file); $parser->parse ($content); $parser->eof; $content = $parser->finish; print NEW $content; close (OLD) or die "close failed ($file): $!"; close (NEW) or die "close failed ($tmpfile): $!"; my $bakfile = "$file.cln.bak"; my $cnt = 0; while (-e $bakfile) { $bakfile = "$file.$cnt.cln.bak"; $cnt++; } #warn "WARNING: renaming disabled\n"; rename ($file, $bakfile) or die "move failed ($file -> $bakfile): $!"; rename ($tmpfile, $file) or die "move failed ($tmpfile -> $file): $!"; print "saved $file as $bakfile\n" }