This was written to strip local domains from hyperlinks in text files, e.g., www.geoffstratton.com/mylink.html will become /mylink.html.
#!/usr/bin/perl -cw
#
# Strip local domains from link urls.
# Each file will be modified in place and a unique backup saved.
use strict;
my $DEBUG = 0;
sub dbg_print {
if ($DEBUG > 0) { foreach my $str (@_) { print "$str\n"; } }
}
# Trim away leading/trailing whitespace
sub trim {
my @out = @_;
for (@out) { s/^\s+//; s/\s+$//; }
return wantarray ? @out : $out[0];
}
# Test whether a list contains a certain value
sub contains {
my ($val, @list) = @_;
return grep { $_ 0eq $val } @list;
}
# Parser class to replace font tags with suitable values
{
package LinkRelativizer;
use base 'HTML::Parser';
use 'URI';
# Can't figure out how to import these functions from main
# Alias them instead
*trim = \&main::trim;
*dbg_print = \&main::dbg_print;
*contains = \&main::contains;
# presentational attributes to remove
my @CHANGE_ATTR = ('href', 'src');
my @LOCAL_DOMAINS (
'put your domains here',
'another domain',
);
my $result = ''; # store processed document in this string
# Create a text representation of a tag.
# Uses attrseq for attributes; values in attr but not attrseq won't
# be copied. This allows stripping attributes while still saving
# their value for later processing.
sub make_text {
my ($self, $tagname, $attr, $attrseq) = @_;
my $text = "<$tagname";
# The / on /> closing marker for strict xhtml tags is not an
# attribute but parser catches it as one. We remedy that here.
foreach my $key (grep { $_ ne '/' } @$attrseq)
{
my $val = $attr->{$key};
$text .= " $key=\"$val\"";
}
$text .= exists $attr->{'/'} ? ' />' : '>';
#print " $text\n";
}
# Process start tags.
sub start {
my ($self, $tagname, $attr, $attrseq, $text) = @_;
foreach my $key (@CHANGE_ATTR) {
if (not exists $attr->{$key}) { next; }
my $link = $attr->{$key};
my $domain = 'empty';
if ($link =~ m|^http://([^/]+)|) { $domain = $1; }
if (not contains ($domain, @LOCAL_DOMAINS)) {
# this isn't the domain you're looking for...
next;
}
$link =~ s{^http://[^/]+(/.*)}{$1}i;
$attr->{$key} = $link;
$text = $self->make_text ($tagname, $attr, $attrseq);
}
dbg_print ("tag = $text");
$result .= $text;
}
# Everything else gets spit out verbatim
sub text {
my ($self, $text, $event) = @_;
#my $foo = trim ($text);
#if ($foo) { print "event $event: $foo\n" }
$result .= $text;
dbg_print ("text = $text") if $text =~ /\S/;
}
# Return the processed document and clear the storage buffer
# Must be called before parsing new doc
sub finish {
my $doc = $result;
# reset package vars
$result = '';
return $doc;
}
# # Use same handler as text for these types
# *comment = \&text;
# *declaration = \&text;
# *process = \&text;
}; # end LinkRelativizer
# ------------- Main --------------------
my $save_backup = 1;
my $rename_file = 1;
foreach my $arg (@ARGV) {
if ($arg eq '-f') {
$save_backup = 0;
next;
} elsif ($arg eq '-r') {
$rename_file = 0;
$save_backup = 0;
next;
}
my $file = $arg;
my $tmpfile = "$file.tmp";
open (OLD, "<$file") or die "open failed ($file): $!";
open (NEW, ">$tmpfile") or die "open failed ($tmpfile): $!";
my @lines = <OLD>;
my $content = join ('', @lines);
my $parser = LinkRelativizer->new(
#api_version => 3,
#default_h => [sub { print shift }, 'text'],
default_h => ['text', 'self,text,event'],
start_h => ['start', 'self,tagname,attr,attrseq,text'],
# end_h => ['end', 'self,tagname,text'],
);
#$parser->xml_mode (1);
#$parser->parse_file ($file);
$parser->parse ($content);
$parser->eof;
$content = $parser->finish;
#font: 8pt 'MS Comic Sans','Arial',sans-serif;
print NEW $content;
close (OLD) or die "close failed ($file): $!";
close (NEW) or die "close failed ($tmpfile): $!";
if ($save_backup) {
my $bakfile = "$file.bak";
my $cnt = 0;
while (-e $bakfile) { $bakfile = "$file.bak.$cnt"; $cnt++; }
rename ($file, $bakfile) or die "move failed ($file -> $bakfile): $!";
print "saved $file as $bakfile\n"
}
if ($rename_file) {
rename ($tmpfile, $file) or die "move failed ($tmpfile -> $file): $!";
} else {
warn "WARNING: rename disabled\n";
}
}
![]()