This was written to strip local domains from hyperlinks in text files, e.g., www.geoffstratton.com/mylink.html will become /mylink.html.
#!/usr/bin/perl -cw # # Strip local domains from link urls. # Each file will be modified in place and a unique backup saved. use strict; my $DEBUG = 0; sub dbg_print { if ($DEBUG > 0) { foreach my $str (@_) { print "$str\n"; } } } # Trim away leading/trailing whitespace sub trim { my @out = @_; for (@out) { s/^\s+//; s/\s+$//; } return wantarray ? @out : $out[0]; } # Test whether a list contains a certain value sub contains { my ($val, @list) = @_; return grep { $_ 0eq $val } @list; } # Parser class to replace font tags with suitable values { package LinkRelativizer; use base 'HTML::Parser'; use 'URI'; # Can't figure out how to import these functions from main # Alias them instead *trim = \&main::trim; *dbg_print = \&main::dbg_print; *contains = \&main::contains; # presentational attributes to remove my @CHANGE_ATTR = ('href', 'src'); my @LOCAL_DOMAINS ( 'put your domains here', 'another domain', ); my $result = ''; # store processed document in this string # Create a text representation of a tag. # Uses attrseq for attributes; values in attr but not attrseq won't # be copied. This allows stripping attributes while still saving # their value for later processing. sub make_text { my ($self, $tagname, $attr, $attrseq) = @_; my $text = "<$tagname"; # The / on /> closing marker for strict xhtml tags is not an # attribute but parser catches it as one. We remedy that here. foreach my $key (grep { $_ ne '/' } @$attrseq) { my $val = $attr->{$key}; $text .= " $key=\"$val\""; } $text .= exists $attr->{'/'} ? ' />' : '>'; #print " $text\n"; } # Process start tags. sub start { my ($self, $tagname, $attr, $attrseq, $text) = @_; foreach my $key (@CHANGE_ATTR) { if (not exists $attr->{$key}) { next; } my $link = $attr->{$key}; my $domain = 'empty'; if ($link =~ m|^http://([^/]+)|) { $domain = $1; } if (not contains ($domain, @LOCAL_DOMAINS)) { # this isn't the domain you're looking for... next; } $link =~ s{^http://[^/]+(/.*)}{$1}i; $attr->{$key} = $link; $text = $self->make_text ($tagname, $attr, $attrseq); } dbg_print ("tag = $text"); $result .= $text; } # Everything else gets spit out verbatim sub text { my ($self, $text, $event) = @_; #my $foo = trim ($text); #if ($foo) { print "event $event: $foo\n" } $result .= $text; dbg_print ("text = $text") if $text =~ /\S/; } # Return the processed document and clear the storage buffer # Must be called before parsing new doc sub finish { my $doc = $result; # reset package vars $result = ''; return $doc; } # # Use same handler as text for these types # *comment = \&text; # *declaration = \&text; # *process = \&text; }; # end LinkRelativizer # ------------- Main -------------------- my $save_backup = 1; my $rename_file = 1; foreach my $arg (@ARGV) { if ($arg eq '-f') { $save_backup = 0; next; } elsif ($arg eq '-r') { $rename_file = 0; $save_backup = 0; next; } my $file = $arg; my $tmpfile = "$file.tmp"; open (OLD, "<$file") or die "open failed ($file): $!"; open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!"; my @lines = <OLD>; my $content = join ('', @lines); my $parser = LinkRelativizer->new( #api_version => 3, #default_h => [sub { print shift }, 'text'], default_h => ['text', 'self,text,event'], start_h => ['start', 'self,tagname,attr,attrseq,text'], # end_h => ['end', 'self,tagname,text'], ); #$parser->xml_mode (1); #$parser->parse_file ($file); $parser->parse ($content); $parser->eof; $content = $parser->finish; #font: 8pt 'MS Comic Sans','Arial',sans-serif; print NEW $content; close (OLD) or die "close failed ($file): $!"; close (NEW) or die "close failed ($tmpfile): $!"; if ($save_backup) { my $bakfile = "$file.bak"; my $cnt = 0; while (-e $bakfile) { $bakfile = "$file.bak.$cnt"; $cnt++; } rename ($file, $bakfile) or die "move failed ($file -> $bakfile): $!"; print "saved $file as $bakfile\n" } if ($rename_file) { rename ($tmpfile, $file) or die "move failed ($tmpfile -> $file): $!"; } else { warn "WARNING: rename disabled\n"; } }