Perl XHTML Validation

Fixes some common XHTML validation errors, to make a web page pass the W3C markup validator. Does this still work? I have no idea, I never build web pages by hand anymore.

#!/usr/bin/perl -cw
#
# Clean up a few common xhtml validation errors
#
# Arguments: provide any number of filenames on the command line
# Each file will be cleaned, with original saved as file.bak[.num]

{
    # Parser class to replace font tags with suitable values
    package CleanupParser;
    use base 'HTML::Parser';

	# presentational attributes to remove from all tags
	# exception: <img> gets to keep width and height
	my @STRIP_ATTR = ('border', 'cellpadding', 'cellspacing',
		'width', 'height', 'align', 'valign',  'hspace', 'vspace',
		'target', 'onmouse', 'onmouseover', 'onclick', 'nowrap'
	);
	# presentation tags to remove
	my @STRIP_TAGS = (
		'center',
	);

	# Font substitution table
	my %fontsub = (
	    #font size => replacement tag
	    '1' => '',
	    '2' => '',  # 2 is normal size, just delete
	    '3' => 'h4',
	    '4' => 'h3',
	    '5' => 'h2',
	    '6' => 'h1',
	    '+1' => 'h3',
	    '+2' => 'h2',
	    '+3' => 'h1',
	    '+4' => 'h1',
	    '+5' => 'h1',
	    '+6' => 'h1',
	    '-1' => '',
	    '-2' => '',
	    '-3' => '',
	    '-4' => '',
	    '-5' => '',
	    '-6' => '',
	);
    # Store font closing tags on this stack
    my @fontstack = ();
    # Store processed document in this string
    my $result = '';

    # Form the correct closing tag from an open tag w/ attributes
    sub close_tag {
        my ($tag) = @_;
        return '/' . (split (/\s+/, $tag))[0];
    }

    # Create a text representation of a tag
    # Uses attrseq for attributes; values in attr but not attrseq won't
    # be copied.  This allows stripping attributes while still saving
    # their value for later processing.
    sub make_text {
        my ($self, $tagname, $attr, $attrseq) = @_;

        my $text = "<$tagname";
		# The / on /> closing marker for strict xhtml tags is not an
		# attribute but parser catches it as one.  We remedy that here.
        foreach my $key (grep { $_ ne '/' } @$attrseq) {
            my $val = $attr->{$key};
            $text .= " $key=\"$val\"";
        }
	$text .= exists $attr->{'/'} ? ' />' : '>';
        #print "    $text\n";
    }

    # Process start tags.  First strip any attributes found
	# in STRIP_ATTR out of $attrseq (but leave $attr untouched
	# so original values still accessible).  Next, if a method named
	# start_TAG exists, call it. Otherwise, just copy the tag to the
	# result.
    sub start {
        my ($self, $tagname, $attr, $attrseq, $text) = @_;

        # strip out presentation attributes
        foreach my $key (@STRIP_ATTR) {
            if (exists $attr->{$key}) {
				# strip from attrseq but leave in attr
                @$attrseq = grep { $_ ne $key } @$attrseq;
            }
        }
        $text = $self->make_text ($tagname, $attr, $attrseq);

		my $meth = $self->can ("start_" . $tagname);
		if ($meth) {
			$self->$meth ($attr, $attrseq, $text);
		}
		elsif (grep { $_ eq $tagname } @STRIP_TAGS) {
			return;
		}
		else {
			$result .= $text;
		}
    }

    # start tag function template
    sub start_X {
        my ($self, $attr, $attrseq, $text) = @_;
    }

	# Change bold to strong
	sub start_b {
        my ($self, $attr, $attrseq, $text) = @_;
		$result .= '<strong>';
	}
	sub end_b 	{
        my ($self, $tagname, $text) = @_;
		$result .= '</strong>';
	}

	# Change italics to em
	sub start_i {
        my ($self, $attr, $attrseq, $text) = @_;
		$result .= '<em>';
	}
	sub end_i {
        my ($self, $tagname, $text) = @_;
		$result .= '</em>';
	}

	# Restore height/width attributes to images
	sub start_img {
        my ($self, $attr, $attrseq, $text) = @_;
		if (exists $attr->{'width'}) { push (@$attrseq, 'width'); }
		if (exists $attr->{'height'}) { push (@$attrseq, 'height'); }
		$text = $self->make_text ('img', $attr, $attrseq);
		$result .= $text;
	}

	# Process <font>
	sub start_font {
        my ($self, $attr, $attrseq, $text) = @_;

        my $size = ($attr->{'size'} or '');
        my $replacement = $fontsub{$size};
        if ($replacement) {
            #print "size = $size, replace with $replacement\n";
            push (@fontstack, close_tag ($replacement));
			# write replacement tag
            $result .= "<$replacement>";
        } else {
            push (@fontstack, '');
            #print "no size attr\n";
        }
	}

    # Process end tags.  If a method named end_TAG exists, call it.
	# Otherwise, just copy the tag to the result.
    sub end {
        my ($self, $tagname, $text) = @_;

		my $meth = $self->can ("end_" . $tagname);
		if ($meth) {
			$self->$meth ($text);
		}
		else {
			$result .= $text;
		}
	}

	# end tag function template
    sub end_X {
        my ($self, $tagname, $text) = @_;
    }

    # Process </font>
    sub end_font {
        my ($self, $text) = @_;
        my $replacement = pop (@fontstack);
        #if ($replacement)  { print "<$replacement>"; }
        if ($replacement)  { $result .= "<$replacement>"; }
    }

    # Everything else gets spit out verbatim
    sub text {
        my ($self, $text) = @_;
        #print $text;
        $result .= $text;
    }

    # Return the processed document and clear the storage buffer
    # Must be called before parsing new doc
    sub finish {
        my $doc = $result;
        $result = '';
        return $doc;
    }
#    # Use same handler as text for these types
#    *comment     = \&text;
#    *declaration = \&text;
#    *process     = \&text;

};   # end CleanupParser


# ------------- Main --------------------

foreach my $file (@ARGV) {
	my $tmpfile = "$file.tmp";
	open (OLD, "<$file") or die "open failed ($file): $!";
	open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!";

	@lines = <OLD>;
	$content = join ('', @lines);

    my $parser = CleanupParser->new(
        #api_version => 3,
        #default_h => [sub { print shift }, 'text'],
        default_h => ['text', 'self,text'],
        start_h => ['start', 'self,tagname,attr,attrseq,text'],
        end_h => ['end', 'self,tagname,text'],
    );
    #$parser->xml_mode (1);
    #$parser->parse_file ($file);
    $parser->parse ($content);
    $parser->eof;
    $content = $parser->finish;

    print NEW $content;
    close (OLD) or die "close failed ($file): $!";
    close (NEW) or die "close failed ($tmpfile): $!";

    my $bakfile = "$file.cln.bak";
    my $cnt = 0;
    while (-e $bakfile) { $bakfile = "$file.$cnt.cln.bak"; $cnt++; }

    #warn "WARNING: renaming disabled\n";
    rename ($file, $bakfile)    or die "move failed ($file -> $bakfile): $!";
    rename ($tmpfile, $file)    or die "move failed ($tmpfile -> $file): $!";
    print "saved $file as $bakfile\n"
}

Loading

Leave a Reply

Your email address will not be published. Required fields are marked *