check html file size

Xah Lee xah at xahlee.org
Tue Oct 4 20:44:02 EDT 2005


would anyone like to translate the following perl script to Python or
Scheme (scsh)?

The script takes an input path and reports all HTML files under it that are
above a certain size (counting inline images).
It also prints a sorted report of the HTML files and their sizes.

(a copy of the script is here:
http://xahlee.org/_scripts/check_file_size.pl
)

 Xah
 xah at xahlee.org
 http://xahlee.org/


# perl

# Tue Oct  4 14:36:48 PDT 2005
# given a dir, report the size of every html file in it (counting inline images)
# XahLee.org

use Data::Dumper;
use File::Find;
use File::Basename;

# Directory tree to scan, and the size (in bytes) above which an html file
# plus its inline images is reported as a problem.
my $inpath = '/Users/t/web/mydirectory/';
my $sizeLimit = 800 * 1000;

# $inpath = $ARGV[0]; # should give a full path; else
# $File::Find::dir won't give full path.

# Get rid of trailing slashes, but leave a bare '/' alone.
$inpath =~ s{(?<=.)/+\z}{};

die "dir $inpath doesn't exist! $!" unless -e $inpath;


##################################################
# subroutines


# getInlineImg($file_full_path) returns an array that is a list of the
# inline image references found in the file, exactly as written in the
# src attribute. For example, it may return ('xx.jpg','../image.png').
#
# The whole file is scanned for <img ...> tags, case-insensitively, and a
# tag may span several lines. Both double- and single-quoted src values are
# recognised; unquoted values are not. Dies if the file cannot be opened.
sub getInlineImg ($) {
    my ($full_file_name) = @_;

    # Three-argument open with a lexical handle: a file name that happens to
    # start with '>' or end with '|' cannot change the open mode.
    open(my $fh, '<', $full_file_name)
        or die "error: can not open $full_file_name $!";
    my $html = do { local $/; <$fh> };    # slurp, so multi-line tags match
    close $fh;
    $html = '' unless defined $html;

    my @linx = ();

    # Only look for src inside an <img ...> tag, so that the src of some
    # other element further along (e.g. <script src=...>) is not picked up.
    # The whitespace required before "src" keeps attributes such as
    # data-src from matching.
    while ($html =~ m{<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]+)"|'([^']+)')}gi) {
        push @linx, defined $1 ? $1 : $2;
    }
    return @linx;
}

# linkFullPath($dir,$locallink) returns a string that is the full path
# to the local link. For example,
# linkFullPath('/Users/t/public_html/a/b', '../image/t.png') returns
# '/Users/t/public_html/a/image/t.png'.
#
# $dir may be given with or without a trailing slash. Runs of slashes are
# collapsed to one, and every "name/.." pair is removed. A '..' that has no
# real directory name in front of it (for instance a link that climbs above
# the start of $dir) is left in place rather than guessed at.
sub linkFullPath($$) {
    my ($dir, $locallink) = @_;

    # Join with exactly one separator; an empty $dir keeps the link relative.
    my $path = $dir;
    $path .= '/' if length($dir) && $dir !~ m{/\z};
    $path .= $locallink;

    $path =~ s{//+}{/}g;

    # Remove "/name/.." repeatedly. The look-ahead after ".." keeps a file
    # such as "..foo" from being mistaken for a parent reference, and the
    # look-ahead before the name stops "." and ".." themselves from being
    # treated as a directory that can be cancelled out.
    1 while $path =~ s{/(?!\.\.?/)[^/]+/\.\.(?=/|\z)}{};

    return $path;
}


# listLocalLinks($html_file_full_path) returns an array where each
# element is a full path of a local link in the html.
#
# Links containing '#', links that start with a URI scheme (mailto:, http:,
# https:, ftp:, ...) and protocol-relative links ('//host/...') are not
# local files and are left out.
#
# NOTE(review): getlinks() is not defined in the visible part of this file;
# presumably it returns the link targets found in the html file -- confirm
# before relying on this sub.
sub listLocalLinks($) {
    my $htmlfile = $_[0];

    # Only the directory part of the html file's path is needed: links are
    # resolved relative to it.
    my (undef, $dir) = fileparse($htmlfile, ('\.html'));

    my @local = grep {
        !m/\#/ && !m{^(?:[A-Za-z][A-Za-z0-9+.\-]*:|//)}
    } getlinks($htmlfile);

    my @linkedFiles = ();
    foreach my $lix (@local) {
        push @linkedFiles, linkFullPath($dir, $lix);
    }
    return @linkedFiles;
}


# listInlineImg($html_file_full_path) returns an array where each
# element is a full path to an inline image in the html.
#
# Image references are resolved relative to the directory of the html file.
# References that start with a URI scheme (http:, https:, data:, ...) or
# with '//' do not name a file under that directory and are left out.
sub listInlineImg($) {
    my $htmlfile = $_[0];

    # Only the directory part of the html file's path is needed.
    my (undef, $dir) = fileparse($htmlfile, ('\.html'));

    my @result = ();
    foreach my $ele (getInlineImg($htmlfile)) {
        next if $ele =~ m{^(?:[A-Za-z][A-Za-z0-9+.\-]*:|//)};
        push @result, linkFullPath($dir, $ele);
    }
    return @result;
}

##################################################
# checkLink() is the File::Find callback. For every text file whose name
# ends in .html it adds the size of the file to the sizes of its inline
# images, prints a "problem" line when the sum exceeds $sizeLimit, and
# records a [total, path] pair in @result for the final report.
sub checkLink {
    my $file = $File::Find::name;

    # Test the name first: it is cheap, whereas -T has to read the file.
    return unless $file =~ m@\.html$@ && -T $file;

    # -s gives a false value for an empty or missing file; count that as 0
    # so the total is always a number.
    my $total = (-s $file) || 0;
    for my $ln (listInlineImg($file)) {
        $total += (-s $ln) || 0;
    }

    if ($total > $sizeLimit) {
        print "problem: file: $file, size: $total\n";
    }

    push(@result, [$total, $file]);
    return;
}

# Walk the tree under $inpath. checkLink prints over-limit files as it goes
# and pushes a [size, path] pair onto @result for every html file it checks.
find(\&checkLink, $inpath);

# Largest total first.
@result = sort { $b->[0] <=> $a->[0] } @result;

print Dumper(\@result);
print "done reporting. (any file above size are printed above.)\n";

__END__




More information about the Python-list mailing list