#!/usr/bin/perl # LEGAL WARNING: # # This software is provided on an "AS IS", basis, # without warranty of any kind, including without # limitation the warranties of merchantability, fitness for # a particular purpose and non-infringement. The entire # risk as to the quality and performance of the Software is # borne by you. Should the Software prove defective, you # and not the author assume the entire cost of any service # and repair. # # # The source documentation of this software is maintained by the program esd2html. # esd2html stands for Embedded Source Documentation to(2) HTML. # esd2html is GNU GPL Perl program available from # # http://www.isys.hu/c/verhas/progs/perl/esd2html # # To create the online HTML files you also need the SDD file along with this source file. # # The original location of this program is # # http://www.isys.hu/c/verhas/progs/perl/webmirror # # If you received this program from a different location visit the URL above to check # for newer version. # # \today # =pod =H mirror.pl =toc =cut # $version = '2.45 pre-release'; @include = (); # domains to include @exclude = (); # domains to exclude $maximal_deepness = 3; # how long should we follow links $maximal_size = 1024*1024; # max 1 MB $pictures = 1; # download pictures $allpictures = 0; # get all pictures even those which are out of the included domain $allframes = 0; # include html pages that are part of a frame even if they are out of the included domain $picpriority = 0; # treat pictures as ordinary page (do not rush for them) $create_leafs = 1; # create redirect files for pages that are too deep $regexps_are_perl = 0; # patterns are normal, * is wild car and that is all $use_full_url = 1; # use full URL on GET even without proxy if( $#ARGV == -1 ){ &print_help; exit; } while( $cmdlinearg = shift @ARGV ){ if( $cmdlinearg eq '-f' ){ &process_command_file(shift); } elsif( $cmdlinearg eq '-s' ){ $log_stdout_on = 0; } elsif( $cmdlinearg eq '-h' ){ &print_help; exit; } else{ print STDERR "invalid option $cmdlinearg\n"; } } $current_size = 0; #downloaded bytes so far while( $page = webpage->next_page ){ last if $maximal_size && $current_size >= $maximal_size; next if $page->fetch_state eq 'SAVED'; my $page_level = $page->level; if( $maximal_deepness && $page_level > $maximal_deepness ){ $url = $page->url; $page->{'HEADER'}->{'content-type'} = 'text/html'; $page->file; $page->text_leaf; $page->create_file if $create_leafs; $page->destroy; next; } redirected: $page->calculate_auth; authretry: $page->get; my $status = $page->{'STATE'}; if( $status == 502 || # Service temporarily overloaded $status == 503 || # Gateway timeout $status == 603 # Timed out ){ $page->fetch_state('FRESH'); $page->schedule; next; } if( $status == 401 && # failed password $page->auth_type eq 'basic' && # authentication is basic (we can not do anything else recently) defined($page->{'AUTHLIST'}) && # we have username:password list defined $#{$page->{'AUTHLIST'}} >= 0 ){ # and have at least one members left to try $page->fetch_state('SCHEDULED'); # 'sub get' is lasy and said it was already retrieved, # but it is not the way like 'sub get' thinks. 'sub get' should try again! goto authretry; # try again with the next username:password # do you know that all listed usernames and password will be tried for # each page retrieved in the domain? It does not learn from previous # retrievals. It tries and tries all passwords, so do not be crasy and define only # one auth for a domain! } if( ( $status >= 400 && $status <= 404 ) || $status == 501 || ( $status >= 600 && $status <= 602 ) ){ $page->text_badstate; $page->create_file if $create_leafs; $page->destroy; next; } if( $status == 301 || $status == 302 ){ my $URL = $page->{'HEADER'}->{'location'}; my $file; my $redir; my $follow_redirect; if( $follow_redirect = &can_get( $URL ) ){ $redir = new webpage($URL); webpage->log("REDIRECT to $URL"); $redir->schedule; $file = $page->relate($redir); }else{ $file = $URL; }# this is an absolute URL $page->text_redirect( $file ); $page->create_file; $page->destroy; next unless $follow_redirect; $page = $redir; goto redirected; } if( $status >= 200 && $status < 300 ){ $current_size += $page->length; if( $maximal_size && $current_size > $maximal_size ){ webpage->log('QUOTA retrieving '.$page->url.' process exceeded.'); webpage->log('QUITting download loop.'); last; # quit from download loop } if( $page->content_type eq 'text/html' ){ $page->split_html; delete $page->{'CONTENT'}; # we delete and the rebuild the content for $tag ( @{$page->{'SPLIT'}} ){ next if $tag->{'TYPE'} eq 'TEXT'; my $tagtype = $tag->{'CONTENT'}->[0]; my $tags = $tag->{'CONTENT'}->[1]; # if( ('a' eq $tagtype || 'area' eq $tagtype) && defined($tags->{'href'}) ){ my $href = $page->urel2abs($tags->{'href'}); my $hrefa = $href; # url with optional anchor $href =~ s/\#([^\/]*)$//; # delete anchor my $anchor = $1; if( &must_get( $href ) ){ my $tpage = new webpage($href); $tpage->level($page_level + 1); $tpage->schedule; $tags->{'href'} = $page->relate($tpage); # point to the copied url $tags->{'href'} .= '#' . $anchor if $anchor; }else{ $tags->{'href'} = $hrefa; # point absolute to the external, not copied url } } # elsif( 'meta' eq $tagtype && lc($tags->{'http-equiv'}) eq 'refresh' ){ next unless $tags->{'content'} =~ /(\d+)\;\s*url\=(.*)/i; my $delay = $1; my $href = $page->urel2abs($2); my $hrefa = $href; # href with optional anchor $href =~ s/\#[^\/]*$//; # delete anchor if( &must_get( $href ) ){ my $tpage = new webpage($hrefa); $tpage->level($page_level + 1); $tpage->schedule; $tags->{'content'} = "$delay; URL=" . $page->relate($tpage); # point to the copied url }else{ $tags->{'content'} = "$delay; URL=$href"; # point absolute to the external, not copied url } } # elsif( 'img' eq $tagtype && $pictures && defined($tags->{'src'}) ){ my $href = $page->urel2abs($tags->{'src'}); my $hrefa = $href; # href with optional anchor $href =~ s/\#[^\/]*$//; # delete anchor if( ( $allpictures && &can_get($href) ) || &must_get( $href ) ){ my $tpage = new webpage($hrefa); $tpage->level($page_level); $tpage->schedule; $tags->{'src'} = $page->relate($tpage); } } # elsif( ('frame' eq $tagtype || 'iframe' eq $tagtype) && defined($tags->{'src'}) ){ my $href = $page->urel2abs($tags->{'src'}); my $hrefa = $href; # href with optional anchor $href =~ s/\#([^\/]*)$//; # delete anchor my $anchor = $1; if( ( $allframes && &can_get($href) ) || &must_get( $href ) ){ my $tpage = new webpage($hrefa); $tpage->level($page_level); $tpage->schedule; $tags->{'src'} = $page->relate($tpage); $tags->{'src'} .= '#' . $anchor if $anchor; } } #
# we do not follow form actions, but the url should be converted to absolute elsif( 'form' eq $tagtype && defined($tags->{'action'}) ){ my $href = $page->urel2abs($tags->{'action'}); $tags->{'action'} = $href; # point absolute to the external, not copied url } } $page->rebuild_content; } $page->create_file; $url_state{$page->url} = 'DONE'; } } exit; =pod =section can_get =H can we get a page? This function gets an URL and decides if the system can get it or not. Usage: =verbatim &can_get($url) =noverbatim The function returns true if the url can be retrieved using http. If this is a mailto: ftp://, https:// or other protocol, then it return zero. =cut sub can_get { my $url = shift; return 1 unless $url =~ m{^\w+:}; # relative url is fine return 1 if $url =~ m{http://}; # http is fine return 0; } =pod =section must_get =H should we get a page? This function gets an URL and decides if the system must get it or not. Usage: =verbatim &must_get($url) =noverbatim The function returns true if the url is to be retrieved by the include and exclude rules. It does not check page level or maximal retrieval size. All pages that T says can not be got T returns zero. =cut sub must_get { my $url = shift; return 0 unless &can_get($url); $url = webpage->normalize_url($url); my $pattern; my $y=0; for $pattern ( @include ){ if( $url =~ /$pattern/ ){ $y = 1; last; } } return 0 unless $y; for $pattern ( @exclude ){ return 0 if $url =~ /$pattern/; } return 1; } =pod =section print_help =H print help on the screen =cut sub print_help { print <; $/ = $oldsep; close STDIN; # no reason to keep it open }else{ open(F,"<$command_file") or die "Can not open command file $command_file"; my $oldsep = $/; undef $/; $lines = ; $/ = $oldsep; close F; } while( $lines =~ /^(config\s+.*)$/m ){ my $includef = $1; $include = quotemeta $includef; $includef =~ s/^config\s*//; open(F,"<$includef") or die "Can not open command file $includef"; my $oldsep = $/; undef $/; my $inclines = ; $/ = $oldsep; close F; $lines =~ s/$include/$inclines/mg; } my @lines = split(/\n+/ , $lines); $lines = ''; for( @lines ){ my $command; my $parameter; chomp; s/^\s*//;s/\s*$//; # delete leading and trailing spaces next if /^\s*$/;# ignore empty lines next if /^\s*\#/; #ignore comment lines if( /(\w+)\s+(.*)/ ){ $command = $1; $parameter = $2; } elsif( /(\w+)/ ){ $command = $1; $parameter = ''; } else{ print STDERR "line '$_' ignored. Syntax error.\n"; next; } if( $command eq 'log' ){ $webpage::log_file = $parameter; next; } if( $command eq 'map' ){ $webpage::map_method = $parameter; next; } if( $command eq 'start' ){ $parameter = 'http://' . $parameter unless $parameter =~ m{^http://}; my $page = new webpage( $parameter ); $page->schedule; $page->level(1); next; } if( $command eq 'include' ){ $parameter = 'http://' . $parameter unless $parameter =~ m{^http://}; $parameter = ®exp_it($parameter); push @include , $parameter; next; } if( $command eq 'regexp' ){ if( $parameter eq 'perl' ){ $regexps_are_perl = 1; next; }elsif( $parameter eq 'normal' ){ $regexps_are_perl = 0; next; } } if( $command eq 'exclude' ){ $parameter = 'http://' . $parameter unless $parameter =~ m{^http://}; $parameter = ®exp_it($parameter); push @exclude , $parameter; next; } if( $command eq 'level' ){ if( $parameter eq 'inf' ){ $maximal_deepness = 0; }else{ $maximal_deepness = $parameter; } next; } if( $command eq 'directget' ){ if( $parameter eq 'partial' ){ $use_full_url = 0; } if( $parameter eq 'full' ){ $use_full_url = 1; } next; } if( $command eq 'size' ){ if( $parameter eq 'inf' ){ $maximal_size = 0; next; } $maximal_size = &prep_postf($parameter); next; } if( $command eq 'directory' ){ webpage->cwd($parameter); next; } if( $command eq 'allpictures' ){ $allpictures = 1; next; } if( $command eq 'allframes' ){ $allframes = 1; next; } # try this first without authentication # noauth http://*.mydomain.com if( $command eq 'noauth' ){ $parameter = ®exp_it($parameter); push @webpage::noauth_domain , { 'DOMAIN' => $domain , 'REALM' => '.*' , 'AUTHS' => '' , # this means w/o authentication }; next; } # auth http://www.mydomain.com/* realm username:password if( $command eq 'auth' ){ $parameter =~ /(\S+)\s*(.*)$/; my $domain = ®exp_it($1); $parameter = $2; $parameter =~ /(\S+)\s*(.*)$/; my $realm = ®exp_it($1); $parameter = webpage->base64($2); push @webpage::auth_domain , { 'DOMAIN' => $domain , 'REALM' => $realm , 'AUTH' => $parameter , }; next; } if( $command eq 'pictures' ){ $pictures = 1; if( $parameter eq 'first' ){ $picpriority = 1; # schedule img url to the start of the schedule list }else{ $picpriority = 0; # schedule img url to the end of the schedule list } # just like html pages next; } if( $command eq 'nopictures' ){ $pictures = 0; $picpriority = 0; # useless but set it to default next; } if( $command eq 'leafs' ){ $create_leafs = 1; next; } if( $command eq 'noleafs' ){ $create_leafs = 0; next; } if( $command eq 'proxy' ){ if( $parameter =~ /(.+)\s+(.+)/ ){ my $proxy = $1; my $domain = $2; $domain = ®exp_it($domain); webpage->define_proxy( $proxy , $domain ); }else{ webpage->define_proxy( $proxy ); } next; } if( $command eq 'interface' ){ if( $parameter =~ /(.+)\s+(.+)/ ){ my $if = $1; my $pattern = ®exp_it($2); webpage->define_interface( $if , $pattern ); }else{ webpage->define_interface( $parameter , '.*' ); } next; } if( $command eq 'pagesizelimit' ){ if( $parameter eq 'inf' ){ $webpage::pagesizelimit = 0; next; } $webpage::pagesizelimit = &prep_postf($parameter); next; } if( $command eq 'unbelieve' ){ if( lc($parameter) eq 'content-length' ){ $webpage::cl_believe = 0; next; } } if( $command eq 'believe' ){ if( lc($parameter) eq 'content-length' ){ $webpage::cl_believe = 1; next; } } if( $command eq 'agent' ){ $webpage::USERAGENT = $parameter; next; } if( $command eq 'cookie' || $command eq 'cookies' ){ if( $parameter eq 'no' ){ $webpage::do_cookies = 0; next; } if( $parameter eq 'yes' ){ $webpage::do_cookies = 1; next; } if( $parameter eq '3dots' ){ $webpage::count_cookie_dots = 2; next; } if( $parameter eq 'dots' ){ $webpage::count_cookie_dots = 1; next; } if( $parameter eq 'nodots' ){ $webpage::count_cookie_dots = 0; next; } if( $parameter eq 'domain' ){ $webpage::care_cookie_domain = 1; next; } if( $parameter eq 'nodomain' ){ $webpage::care_cookie_domain = 0; next; } if( $parameter =~ /save\s+session/ ){ $webpage::save_session_cookies = 1; next; } if( $parameter =~ /don\'?t\s+save\s+session/ ){ $webpage::save_session_cookies = 0; next; } if( $parameter =~ /backup\s+(\d+)/ ){ $webpage::cookie_file_backup_nr = $1; next; } if( $parameter =~ /file\s+(.*)/ ){ $webpage::cookie_file = $1; webpage->load_cookies($webpage::cookie_file); next; } if( $parameter =~ /load\s+(.*)/ ){ webpage->load_cookies($1); next; } } print STDERR "line '$_' ignored\n"; } } =pod =section prep_postf =H Prepare a size parameter Usage: =verbatim $p = &prep_postf($p); =noverbatim Checks if there is any T or T postfix after the size parameter, and converts the numeric value. As an example it returns T<1024> for T<1K>, or T<1024*1024> for T<1M>. =cut sub prep_postf { my $p = shift; if( $p =~ s/M$// ){ $p *= 1024*1024 } elsif( $p =~ s/K$// ){ $p *= 1024 } return $p; } =pod =section regexp_it =H Converts a pattern to a regular expression Usage: =verbatim $regexp = &regexp_it("wildchar pattern*"); =noverbatim Domains in the config file can be given as wild charactered strings. This routine converts these strings to Perl regular expression. This is nothing else then T all the string and convert the wild character T<*> toT<.*> Some user may like to specify Perl regular expressions saying =verbatim regexp perl =noverbatim in the RDF file. The effect of that command is that the global variable T<$regexps_are_perl> is set. When that variable is true this function becomes transparent and returns the strings as they came is. =cut sub regexp_it { my $string = shift; return $string if $regexps_are_perl; $string = quotemeta $string; $string =~ s/\\\*/.*/g; #make * the joker character return $string; } =pod =section webpage =H PACKAGE WEBPAGE =abstract This package/class implements the web page functionality. A web page is stored in the memory, and is usually prepared for download, downloaded, prepared to be saved and saved. =end This package/class implements the web page functionality. A web page is stored in the memory, and is usually prepared for download, downloaded, prepared to be saved and saved. A T object is a page which is downloaded from a URL. One page object has one URL and one URL is assigned to a page object. Sometimes the same URL returns different pages, but somewhere implementation has to stop. There are many things that the program can do with such an objects. See the methods. The only thing which is not really object oriented is the cookie handling. Cookies are stored in the array T<@cookies> which belong to the package T. But you can not create different cookie realms for different downloads. There is only a single T<@cookies> array. =cut package webpage; BEGIN { $umask = 0777; $USERAGENT = 'my useragent 1.0'; $map_method = 'simple'; # by default we do not create sub directories like com/digital/www/80/... $map_counter = 1; # file name counter when mapping is 'flat' $object_hash = {}; # containing all objects created @schedule_list = (); %host_ip = (); # store the ip addresses for hosts already accessed %ip_host = (); # and store the reverse @proxies = (); # proxies to use $save_directory = '.';# where to store the result $log_file = ''; # where to write the log $log_opened = 'NOT YET'; $log_stderr_on = 0; # we do not send log messages to stderr by default $log_stdout_on = 1; # we send log messages to stdout by default $pagesizelimit = 0; # no page size limit by default $cl_believe = 1; # do believe reported content length @auth_domain = (); # list of domains that authentication was defined @noauth_domain = (); # list of domains that should be first tried w/o authentication $interfaces_defined = 0; my $hostname = `hostname`; chomp $hostname; @interfaces = ($hostname); %interfaces = ( $hostname => '.*' ); @mdays = ( '31', '28', '31', '30', '31', '30', '31', '31', '30', '31', '30', '31' ); $do_cookies = 1; # process cookies @cookies = (); # each item is a pointer to a hash of { NAME => ?? , VALUE => ??, # EXPIRES => ??, PATH => ?? , # DOMAIN => ??, SECURE => 0|1 } $count_cookie_dots = 1; # 0 = do not care how many dots there are # 1 = request at least two dots (default) # 2 = request two or three dots (according to the spec, but it is crasy! *.mydom.hu ??) $care_cookie_domain = 1;# check that the indicated domain fits the sender (i.e. www.cnn.com should not # send a cookie with the domain .bbc.com) $save_session_cookies = 0; # do not save the cookies that have no expiration time $cookie_file_backup_nr = 100; # how many backup files to keep from old cookie files with names: # cookie.txt.00 ,cookie.txt.01 ,cookie.txt.02 ,cookie.txt.03 ... $cookie_file = ''; # where to save the cookies %mime_extension = ( 'application/x-gzip' => ['gz'], 'application/x-compress' => ['Z'], 'applicarion/x-ns-proxy-autoconfig' => ['pac'], 'application/x-javascript' => ['js','ls','mocha'], 'application/x-tcl' => ['tcl'], 'application/x-sh' => ['sh'], 'application/x-csh' => ['csh'], 'application/postscript' => ['ai','eps','ps'], 'application/octet-stream' => ['exe','~.*'], # match all extensions 'application/x-cpio' => ['cpio'], 'application/x-gtar' => ['gtar'], 'application/x-tar' => ['tar'], 'application/x-shar' => ['shar'], 'application/x-zip-compressed' => ['zip'], 'application/x-stuffit' => ['sit'], 'application/mac-binhex40' => ['hqx'], 'video/x-msvideo' => ['avi'], 'video/quicktime' => ['qt','mov'], 'video/mpeg' => ['mpeg','mpg','mpe'], 'audio/x-wav' => ['wav'], 'audio/x-aiff' => ['aif','aiff','aifc'], 'audio/basic' => ['au','snd'], 'application/fractals' => ['fif'], 'image/ief' => ['ief'], 'image/x-MS-bmp' => ['bmp'], 'image/x-rgb' => ['rgb'], 'image/x-portable-pixmap' => ['ppm'], 'image/x-portable-graymap' => ['pgm'], 'image/x-portable-bitmap' => ['pbm'], 'image/x-portable-anymap' => ['pnm'], 'image/x-xwindowdump' => ['xwd'], 'image/x-xpixmap' => ['xpm'], 'image/x-xbitmap' => ['xbm'], 'image/x-cmu-raster' => ['ras'], 'image/tiff' => ['tiff','tif'], 'image/jpeg' => ['jpeg','jpg','jpe'], 'image/gif' => ['gif'], 'application/x-fexinfo' => ['texi'], 'application/x-fexinfo' => ['texinfo'], 'application/x-dvi' => ['dvi'], 'application/x-latex' => ['latex'], 'application/x-tex' => ['tex'], 'application/rtf' => ['rtf'], 'text/html' => ['html','htm'], 'text/plain' => ['txt','text'], ); } END { if( $cookie_file ){ webpage->save_cookies($cookie_file); } } =pod =section calculate_auth =H Calculate the authentication string to be used for a page Usage: =verbatim $page->calcuate_auth =noverbatim A page can be retrieven with or without authentication string. The RDF file defines how to retrieve the pages. This method examines the url of the page and creates an array containing all the authentication information that should be tried to download the page. =cut sub calculate_auth { my $self = shift; return if defined $self->{'AUTHLIST'}; for( @noauth_domain ){ if( $self->url =~ /$_->{'DOMAIN'}/ ){ $self->{'AUTHLIST'} = [ { 'REALM' => '.*' , 'AUTH' => '' } ]; last;# if one mathces it is enough. we won't try several times w/o password only once } } $self->{'AUTHLIST'} = [] unless defined $self->{'AUTHLIST'}; for( @auth_domain ){ if( $self->url =~ /$_->{'DOMAIN'}/ ){ push @{$self->{'AUTHLIST'}} , { 'REALM' => $_->{'REALM'} , 'AUTH' => $_->{'AUTH'} }; } } } =pod =section new =H create a new page object This method either creates a new object for the url given as parameter or returns the reference to the object if an object was already created for the url. Usage: =verbatim $page = new webpage ($url); =noverbatim =cut sub new { my $class = shift; my $url = shift; $url =~ s/\#[^\/]*$//; # delete anchor if it exists (page never has an anchor) if( $object_hash{$url} ){# return the object if it was already created return $object_hash{$url}; } my $self = {}; bless $self,$class; $url = 'http://' . $url unless $url =~ m{^http://}; $self->{'URL'} = $url; $self->fetch_state('FRESH'); $object_hash{$url} = $self; return $self; } =pod =section log =H Issue a log intem into the log file This function creates an item into the log file. Usage: =verbatim webpage->log('log message'); =noverbatim =cut sub log { shift; my $text = shift; my $now = time(); my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime($now); $year +=1900; $mon++; $mon = '0' . $mon if $mon < 10; $mday = '0' . $mday if $mday < 10; $sec = '0' . $sec if $sec < 10; $hour = '0' . $hour if $hour < 10; $min = '0' . $min if $min < 10; $text = "$year.$mon.$mday $hour:$min:$sec $text"; print STDERR "$text\n" if $log_stderr_on; print STDOUT "$text\n" if $log_stdout_on; return unless $log_file; return if $log_opened eq 'FAILED'; my $fom = '>'; $fom .= '>' unless $log_opened eq 'NOT YET'; if( ! open(LOG,"$fom$log_file") ){ $log_opened = 'FAILED'; print STDERR "Log file $log_file can not be opened.\n"; return; }else{ $log_opened = 'YES' } print LOG "$text\n"; close LOG; return; } =pod =section define_proxy =H Define a new proxy server This class method is called during configuration. This simpply defined a new proxy server to be used for a certain domain pattern. Usage: =verbatim webpage->define_proxy("http://proxy.myc.com" , "http://mydomain*" ) =noverbatim =cut sub define_proxy { shift; # class method my $proxy = shift; my $domain = shift; $domain = '.*' unless defined $domain; push @proxies , { 'PROXY' => $proxy , 'DOMAIN' => $domain }; } =pod =section save_cookies =H Save all the cookies to a cookies file Usage: =verbatim webpage->save_cookies("fileName"); =noverbatim This class method saves the content of the global array T<@cookies> into the file T given as argument. Each line of the file will contain one cookie in the form: =verbatim NAME=VALUE; expires=DATE;path=PATH; domain=DOMAIN_NAME; secure =noverbatim =cut sub save_cookies { shift; #class my $file = shift; # create backup files if( $cookie_file_backup_nr && -e $file ){ my $i; for $i ( 0 ... $cookie_file_backup_nr-1 ){ $i = ('0' x (length($cookie_file_backup_nr-1)-length($i)) ) . $i; if( ! -e "$file.$i" ){ if( rename $file , "$file.$i" ){ $i++; $i = 0 if $i == $cookie_file_backup_nr; $i = ('0' x (length($cookie_file_backup_nr-1)-length($i)) ) . $i; unlink "$file.$i"; last; } } } } my $timeNow = time(); # expired cookies are not saved if( ! open($file,">$file") ){ webpage->log("ERROR The cookie file $file was not saved."); return; } for $this_cookie ( @cookies ){ if( ( $save_session_cookies && $this_cookie->{'EXPIRES_CONVERTED'} == -1 ) || $this_cookie->{'EXPIRES_CONVERTED'} > $timeNow ){ # not expired yet print $file $this_cookie->{'NAME'},'=',$this_cookie->{'VALUE'}; if( $this_cookie->{'EXPIRES_CONVERTED'} != -1 ){ print $file '; expires=',$this_cookie->{'EXPIRES'}; } print $file '; path=',$this_cookie->{'PATH'}; print $file '; domain=',$this_cookie->{'DOMAIN'}; print $file '; secure' if $this_cookie->{'SECURE'}; print $file "\n"; } } close $file; } =pod =section load_cookies =H Load all the cookies from the cookies file that was created calling save_cookies Usage: =verbatim webpage->load_cookies("fileName"); =noverbatim This class method loads the content of the global array T<@cookies> from the file T given as argument. The old content of the array is not deleted, so cookie files can be merged. This method actually switches off cookie domain compliancy check and calls the method T using a dummy object for each line of the cookie file. =cut sub load_cookies { shift; #class my $file = shift; if( ! open($file,"<$file") ){ webpage->log("ERROR the cookie file $file was not loaded."); return; } my $my_care_cookie_domain = $care_cookie_domain; my $my_count_cookie_dots = $count_cookie_dots; my $dummy = webpage->new('http://www.isys.hu/c/verhas'); # it is a dummy, it could be any url while( <$file> ){ chomp; $dummy->set_cookie($_); } close $file; my $care_cookie_domain = $my_care_cookie_domain; my $count_cookie_dots = $my_count_cookie_dots; } =pod =section list_cookies =H List all cookies that are to be sent with the http request Usage: =verbatim print "Cookie: ",$page->list_cookies,"\n"; =noverbatim This method checks the global array T<@cookies> and creates a string of the format =verbatim NAME1=OPAQUE_STRING1; NAME2=OPAQUE_STRING2 ... =noverbatim including all the cookies that fit the URL domain, path and is not expired yet. =cut sub list_cookies { my $self = shift; my $i,$j; # index variables for sorting cookies my $domain = $self->host; my $path = $self->path; my $timeNow = time(); # expired cookies are not listed my $this_cookie; my @cookie_list = (); for $this_cookie ( @cookies ){ my $domainpattern = '.*' . quotemeta($this_cookie->{'DOMAIN'}) . '$'; my $pathpattern = quotemeta($this_cookie->{'PATH'}) . '.*$'; if( $domain =~ /$domainpattern/ && $path =~ /$pathpattern/ && # domain and path is OK ($this_cookie->{'EXPIRES_CONVERTED'} == -1 || # there is no expiration defined $this_cookie->{'EXPIRES_CONVERTED'} > $timeNow ) ){ # or not expired yet push @cookie_list , $this_cookie; } } # simple buble sort to order the cookies so that the more # precise path comes first as it is required by the spec. for $i ( 0 .. $#cookie_list-1 ){ for $j ( $i+1 .. $#cookie_list ){ if( length($cookie_list[$i]->{'PATH'}) < length($cookie_list[$j]->{'PATH'}) ){ my $swap = $cookie_list[$i]; $cookie_list[$i] = $cookie_list[$j]; $cookie_list[$j] = $swap; } } } my $list = ''; for( @cookie_list ){ $list .= '; ' if $list; $list .= $_->{'NAME'} . '=' . $_->{'VALUE'}; } return $list; } =pod =section set_all_cookies =H Set all cookies of a retrieved page Usage: =verbatim $page->set_all_cookies; =noverbatim This method calls the method R method for each cookie sent with the page and stores the cookies sent in the global T<@cookies> array. =cut sub set_all_cookies { my $self = shift; if( defined $self->{'HEADER'}->{'set-cookie'} ){ $self->set_cookie($self->{'HEADER'}->{'set-cookie'}); } if( defined($self->{'HEADERc'}->{'set-cookie'}) && $#{$self->{'HEADERc'}->{'set-cookie'}} > -1 ){ my $thisccokie; for $thiscookie ( @{$self->{'HEADERc'}->{'set-cookie'}} ){ $self->set_cookie($thiscookie); } } } =pod =section set_cookie =H Set a new cookie or update an old one Usage: =verbatim $page->set_cookie("cookie string with options") =noverbatim This method sets a new cookie and modifies the T<@cookies> global array. =cut sub set_cookie { my $self = shift; my $cookie_string = shift; # NAME=VALUE; expires=DATE;path=PATH; domain=DOMAIN_NAME; secure my ($expires,$path,$domain,$secure); my $i; # loop index variable going through the cookie array $cookie_string =~ s/\s*(.*?)\s*;//; my $name_value = $1; my ($name,$value) = split(/=/ , $name_value); my @params = split(/;/,$cookie_string); for( @params ){ s/^\s*//; s/\s*$//; if( /^secure$/i ){ $secure = 1; next; } if( /expires\s*=\s*(.*)/i ){ $expires = $1; next; } if( /path\s*=\s*(.*)/i ){ $path = $1; next; } if( /domain\s*=\s*(.*)/i ){ $domain = $1; next; } } $path = $self->path unless defined $path; # path that the cookie comes from $secure = 0 unless defined $secure; # it is not secure by default if( defined $expires ){ $expires_converted = webpage->cookieT2time($expires); }else{ $expires_converted =-1 } # expires by the end of session if( defined( $domain ) ){ if( $care_cookie_domain ){ my $domainpattern = '.*' . quotemeta($domain) . '$' ; if( $self->host !~ /$domainpattern/ ){ webpage->log("ERROR cookie domain $domain has not fit sending url ", $self->url); return; } } }else{ # default is the machine that sends the cookie and it obviously fits itself $domain = $self->host; } if( $count_cookie_dots ){ # calculate the number of dots in the domain parameter # the spec requires at least two or three my $dot_nr = length( $domain =~ /\./g ); if( $count_cookie_dots == 2 || $domain =~ /\.gov/i || $domain =~ /\.edu/i || $domain =~ /\.net/i || $domain =~ /\.org/i || $domain =~ /\.com/i || $domain =~ /\.mil/i || $domain =~ /\.int/i ){ $dot_nr -= 2; }else{ $dot_nr -= 3 } if( $dot_nr < 0 ){ webpage->log("ERROR cookie domain $domain has not enough dots in it."); return; } } for $i ( 0 .. $#cookies ){ if( $cookies[$i]->{'NAME'} eq $name && $cookies[$i]->{'DOMAIN'} eq $domain && $cookies[$i]->{'PATH'} eq $path ){ $cookies[$i]->{'VALUE'} = $value; $cookies[$i]->{'EXPIRES'} = $expires; $cookies[$i]->{'EXPIRES_CONVERTED'} = $expires_converted; $cookies[$i]->{'SECURE'} = $secure; # later version may support HTTPS return; # the old cookie is stored with the new value } } push @cookies , { 'DOMAIN' => $domain, 'PATH' => $path, 'NAME' => $name, 'VALUE' => $value, 'EXPIRES' => $expires, 'EXPIRES_CONVERTED' => $expires_converted , 'SECURE' => $secure # later version may support HTTPS }; return; # the cookie was stored as a new value } =pod =section cookieT2time =H Convert cookie expiration time to GMT time seconds Usage: =verbatim webpage->cookieT2time("cookie expiration string") =noverbatim Convert a time value given in the cookie to seconds as given by the system function T. Expiration time in the cookies are given in the format T We do not care the T part as this is redundant, and do not care the T as that should always be T and no other timezone is permitted. This class method examines the string, splits it into parts and calculates the seconds between January 1, 1970. and the time given by the string. This is a class method. =cut sub cookieT2time { shift; #class my $cookie_time = shift; # of the format Wdy, DD-Mon-YY HH:MM:SS GMT # convert the string into separate values $cookie_time =~ /.*\,\s*(\d+)\-(\w+)\-(\d+)\s*(\d+):(\d+):(\d+)/i; my ($mday,$moname,$year,$hour,$min,$sec) = ($1,$2,$3,$4,$5,$6); # the stupid cookie time format is not Y2K # we do our best works from 1998 till 2098 and when year is given as YYYY if( length($year) == 2 ){ if( $year eq '98' || $year eq '99' ){ $year += 1900; }else{ $year += 2000; } }# now we have a four digit year # get the serial number of the month $month = { 'jan' => 0 , 'feb' => 1 , 'mar' => 2 , 'apr' => 3 , 'may' => 4 , 'jun' => 5 , 'jul' => 6 , 'aug' => 7 , 'sep' => 8 , 'oct' => 9 , 'nov' => 10 , 'dec' => 11 } -> { lc $moname }; # count the days since the epoch (January 1, 1970.) my $ds = 0; for(1970 .. $year ){ $ds += ((($_ % 4) == 0) && ( ($_ % 100) || (($_ % 400)==0) )) ? 366 : 365 } # summ up the day of the months until the current for ( 0 .. $month-1 ){ $ds += $mdays[$_] ; } # add one if this is a leap year and we are after february if( ((($_ % 4) == 0) && ( ($_ % 100) || (($_ % 400)==0) )) && $month >1 ){ $ds++ } # summ up the days of the current months $ds += $day; # now convert all this stuff to seconds $ds *= 86400; # add hour, minute and seconds values $ds += $sec + 60*($min + 60*$hour); return $ds; } =pod =section normalize_url =H Normalize a url Get a url as an argument and check if an ip number is given instead of a host name. If the ip number is specified then replace it with the host name and return the resulting url. This is a class method. Usage: =verbatim $url = webpage->normalize_url($url); =noverbatim Sometimes getting a page using symbolic host name the webserver answers a redirect to an URL which is the same as the requested but having the machine ip number instead of the symbolic name (reason?). If we used the the ip number instead of the symbolic name we could easily get out of the download domain that the user usually defines using the symbolic names. Therefore this method is called whenever the page is going to be saved and whenever the url is check if it is in the domain or not. This method is also used to decide if a cookie is to be sent to the server. =cut sub normalize_url{ shift; my $url = shift; if( $url =~ m{^http://([\w-\.]+):?(\d*)(/.*)?} ){ my $host = $1; #URL host my $port = $2; #URL port my $path = $3; #URL path if( $host =~ m{\d+\.\d+\.\d+\.\d+} ){ # this is an ip number $host = $ip_host{$host} if defined $ip_host{$host}; my $ourl = $url; $url = 'http://' . $host; $url .= ':' . $port if $port; $url .= $path if $path; webpage->log("NORMALIZED $ourl to $url"); return $url; }else{ return $url; } }else{ return $url; } } =pod =section host =H Return the host of a page Calculate the host of a page. This method is used in cookie handling. This method uses the URL of a page and extracts the host from it. If this value is numeric (like 193.68.44.152) it tries to see if this host is already known by its name (calls the method T). As a side effect this method also calculates the R value of a page. Usage: =verbatim $page->host; =noverbatim =cut sub host { my $self = shift; if( ! defined $self->{'HOST'} ){ webpage->normalize_url($self->url) =~ m{^http://([\w-\.]+):?(\d*)(/.*)?}; $self->{'HOST'} = $1; #URL host $self->{'PATH'} = $3; if( $self->{'PATH'} =~ /\.(\d\w)+$/ ){ # path ends with file name $self->{'PATH'} =~ s{/[^/]*$}{/}; # delete the file name }else{ $self->{'PATH'} .= '/' unless $self->{'PATH'} =~ m{/$}; } } return $self->{'HOST'}; } =pod =section path =H Return the path of a page Calculate the path of a page. This method is used in cookie handling. This method uses the URL of a page and extracts the path from it. Usage: =verbatim $page->path; =noverbatim If the path value was not calculated before this method invokes the method R to calculate the host and the path values as the same time. =cut sub path { my $self = shift; if( ! defined $self->{'PATH'} ){ $self->host; # calculates also PATH } return $self->{'PATH'}; } =pod =section url =H Return the url of a page This is just a stub to get the url of the page. Usage: =verbatim $page->url; =noverbatim =cut sub url { my $self = shift; return $self->{'URL'}; } =pod =section define_interface =H Define a new interface This class method is called during configuration. This method defines a new interface which should be used during download. The parameters of the method define the IP name of the interface and a pattern for which the interface is to be used. Usage: =verbatim webpage->define_interface($interface,$pattern); =noverbatim =cut sub define_interface { shift; # class method my $if = shift; my $pattern = shift; if( ! $interfaces_defined ){ @interfaces = (); %interfaces = ( ); $interfaces_defined = 1; } push @interfaces , $if; $interfaces{$if} = $pattern; } =pod =section fetch_state =H Set or get the fetch_state of a page Usage: =verbatim $page->fetch_state or $page->fetch_state($state) =noverbatim During the retrieval process a page can have the follwing states: =itemize =item T the page object was generated, but nothing has beed done to the page =item T the page object was scheduled for retrieval =item T the page object was retrieved sucessfully or unsuccessfully via a http GET method =item T the page was saved to the disk and the content was destroyed (see R). =noitemize =cut sub fetch_state { my $self = shift; my $state = shift; $self->{'FETCH_STATE'} = $state if defined $state; return $self->{'FETCH_STATE'}; } =pod =section content_type =H Return the content type header information Usage: =verbatim $page->content_type =noverbatim This method actually returns a stripped content type that does not include character code page or anything alike, which can stand after a semicolon. =cut sub content_type { my $self = shift; if( ! defined $self->{'STRIPPED_CONTENT_TYPE'} ){ $self->{'STRIPPED_CONTENT_TYPE'} = $self->{'HEADER'}->{'content-type'}; $self->{'STRIPPED_CONTENT_TYPE'} =~ s/\s*;.*//; } return $self->{'STRIPPED_CONTENT_TYPE'}; } =pod =section level =H set or get the level of a page Each page has a level which measures the distance of the page from the starting page(s). The starting pages have level one. Each page referenced by a page having level N has level N+1. This method sets or retrieves the level of a page. If the level that the argument specifies is higher than the level that the page already has then the higher value is abandoned, because there is a shorter way getting to this page from the start pages. If the page has a level and the new level is smaller then the new level is used. Usage: =verbatim $page->level or $page->level($level) =noverbatim =cut sub level{ my $self = shift; my $level = shift; $self->{'LEVEL'} = $level if defined($level) && (!defined($self->{'LEVEL'}) || $self->{'LEVEL'} > $level); return $self->{'LEVEL'}; } =pod =section schedule =H Schedule a page to be retrieved Put the page object on the schedule list and set its state to T. Usage: =verbatim $page->schedule =noverbatim =cut sub schedule{ my $self = shift; my $priority = shift; $priority = 0 unless defined $priority; return if $self->fetch_state ne 'FRESH'; webpage->log('SCHEDULE ' . $self->url); push @schedule_list, { 'OBJECT' => $self , 'PRIORITY' => $priority }; $self->fetch_state('SCHEDULED'); } =pod =section next_page =H Get the next page to be retrieved Get the next page to be retrieved from the schedule list and return the object. Usage: =verbatim $page->next_page =noverbatim returns the page object =cut sub next_page { my $sle = pop @schedule_list; return $sle->{'OBJECT'}; } =pod =section destroy =H Destroy a page This method deletes the content and the split content of a page. This can save up huge memory after a big GIF, JPG or HTML file was saved. The page however is still alive, because other pages may reference it. Usage: =verbatim $page->destroy; =noverbatim =cut sub destroy { my $self = shift; delete $self->{'CONTENT'}; # this is the most memory hungry part, but the rest is kept for reference delete $self->{'SPLIT'}; } =pod =section cwd =H Set current working directory Sets the directory where the pages should be saved. This directory is prepended to the relative directory string calculated by R. Usage: =verbatim webpage->cwd($directory) =noverbatim =cut sub cwd { shift; my $cwd = shift; if( defined($cwd) ){ $save_directory = $cwd; } return $save_directory; } =pod =section proxy =H Set or retrieve the proxy When a page has a proxy defined the method R tries to retirve it using the proxy. This method sets or retrieves the proxy for the page Usage: =verbatim $page->proxy or $page->proxy($proxy) =noverbatim =cut sub proxy { my $self = shift; my $proxy = shift; $self->{'PROXY'} = $proxy if defined $proxy; $self->{'PROXY'}; } =pod =section relate =H Calculate relative URL Pages A and B are retrieved, both are to be saved and A has a reference to B. The reference in A referencing B can be absolute or relative. However after saving both of them the saved form of A B have a relative reference to the saved location of B. This method calculates and returns the relative url from A to B. Usage: =verbatim $href = $pageA->relate($pageB) =noverbatim where both T<$pageA> and T<$pageB> are objects to pages. =cut sub relate { my $self = shift; my $rpage = shift; my $file = $rpage->file; my $return; my $this_file; my @base_path = split( m{/} , $self->file ); my @rela_path = split( m{/} , $file ); if( $#base_path > -1 && $base_path[$#base_path] =~ /\./ ){ $this_file = pop @base_path; # pop off file name }else{ $this_file = ''; } while( 1 ){ last if $#base_path == -1 || $#rela_path == -1 || $base_path[0] ne $rela_path[0] ; shift @base_path; shift @rela_path; } while( $#base_path > -1 ){ shift @base_path; unshift @rela_path , '..'; } $return = join( '/' , @rela_path ); $return = '' if $this_file && $this_file eq $return; return $return; } =pod =section split_html =H Split a HTML content into texts and tags array An HTML content is viewed as an array of texts and tags. This method takes the content of a page and creates an array of texts and tags. Usage: =verbatim $page->split_html =noverbatim See also R. =cut sub split_html { my $self = shift; my $html = \$self->{'CONTENT'}; my @tags = (); my $index,$cpos = 0; while( 1 ){ my $text; $index = index( $$html, '<' , $cpos); if( $index > -1 ){ $text = substr($$html,$cpos,$index-$cpos); }else{ $text = substr($$html,$cpos); } if( $text ){ push @tags , { TYPE => 'TEXT' , CONTENT => $text }; } last if $index == -1; $index ++;# step over the '<' $cpos = index( $$html , '>' , $index ); if( $cpos > -1 ){ $text = substr($$html,$index,$cpos-$index); }else{ $text = substr($$html,$index); } if( $text ){ if( $text =~ /\!/ ){# this is a comment, and not a real tag # treat it as pure text push @tags , { TYPE => 'TEXT' , CONTENT => "<$text>" }; # # LATER WE SHOULD DEAL WITH TAGS AS TEXT AS WELL # OR IMPLEMET A JAVASCRIPT ENGINE THAT CAN CALCULATE ALL THE POSSIBLE # HREF VALUES THAT A JAVASCRIPT CAN CALCULATE (HA HA HA :-) # }else{ my @tagd = &tagdisa($text); push @tags , { TYPE => 'TAG' , CONTENT => \@tagd }; } } last if $cpos == -1; $cpos++;# step over the '>' } $self->{'SPLIT'} = \@tags; delete $self->{'CONTENT'}; } # # Get a string containing a html tag and # return a two element list. The first element of the list # is the type of the tag (like A, B, P, H1). The second element is # a reference to a hash. Each element contains the tag parameter name # and value, like (HREF,http://www.mycom.com) for an A tag. # # THIS IS A FUNCTION NOT A METHOD USED ONLY INTERNALLY!! sub tagdisa { my $tag = shift; my %params = (); $tag =~ s/^\s*//; # delete leading spaces $tag =~ s/^(\/?\w+)//; # get the type of the tag, NAME or /NAME my $type = lc $1; $tag =~ s/^\s*//; # delete space after while( $tag ){ $tag =~ s/^([^\s\=]+)//; # get parameter name my $parn = lc $1; $tag =~ s/^\s*//; # delete space after the parameter value # if there is no = unless( $tag =~ s/^\=// ){ # then it is a parameter without value $params{$parn} = ''; next; } $tag =~ s/^\s*//; # delete space after the = my $parv = ''; # no parameter value so far if( $tag =~ s/^\"// ){ # starts with " like href="xxx" $tag =~ s/^([^"]*)//; # anything until the trailing " $parv = $1; $tag =~ s/^\"//; # delete the trailing " }else{ # does not start with " like SIZE=3 $tag =~ s/^(\S*)//; # anything until space $parv = $1; } $tag =~ s/^\s*//; # delete space after the parameter value $params{$parn} = $parv; } return ($type, \%params); } =pod =section rebuild_content =H rebuild the content of a html file After the html content was split (see R) and the appropriate references were modified the content should be rebuilt. This method recreates the modified content from the split format. Usage: =verbatim $page->rebuild_content =noverbatim =cut sub rebuild_content { my $self = shift; my $tag; $self->{'CONTENT'} = ''; # we delete and the rebuild the content for $tag ( @{$self->{'SPLIT'}} ){ if( $tag->{'TYPE'} eq 'TEXT' ){ $self->{'CONTENT'} .= $tag->{'CONTENT'}; }else{ my $tagtype = $tag->{'CONTENT'}->[0]; my $tags = $tag->{'CONTENT'}->[1]; $self->{'CONTENT'} .= "<$tagtype"; while( ($tagkw,$tagv) = each %{$tags} ){ $self->{'CONTENT'} .= " $tagkw"; $self->{'CONTENT'} .= "=\"$tagv\"" if defined $tagv; } $self->{'CONTENT'} .= '>'; } } delete $self->{'SPLIT'}; } =pod =section urel2abs =H Convert url to absolute This method convert a relative url to absolute. This is needed for the retrieval. The argument of the method can be a relative url from a T or T tag of the html of the page or can be absolute. The returned url is an absolute url. Usage: =verbatim $absolute_url = $page->urel2abs($href_url); =noverbatim =cut sub urel2abs { my $self = shift; my $relURL = shift; # relative or absolute URL my $baseURL = $self->{'URL'}; my $i,$return; return $relURL if $relURL =~ m{^\w+:}; $baseURL = 'http://' . $baseURL unless $baseURL =~ m{^http://}i; $baseURL =~ m{^http://([\d\w-\.]+):?(\d*)(/.*)?(\#.*)?}i; my ($baseHost, $basePort, $basePath, $baseAnchor) = ( $1, $2, $3, $4); $basePath =~ s{^/}{}; if( $relURL =~ m{^/} ){ $return = 'http://' . $baseHost; $return .= ":$basePort" if $basePort ; $return .= $relURL; return $return; } if( $relURL =~ m{^\#} ){ $return = 'http://' . $baseHost; $return .= ":$basePort" if $basePort ; $return .= "/$basePath"; $return .= $relURL; return $return; } my @relList = split( m{/} , $relURL ); my @absList = split( m{/} , $basePath ); # trailing file name should be thrown away if( $#absList > -1 ){ my $fileName = pop @absList; push @absList , $fileName unless $fileName =~ m{\.}; } push @absList , @relList; @relList = (); while( $i = shift @absList ){ if( $i eq '..' ){ pop @relList; }else{ push @relList , $i; } } $relURL = '/' . join( '/' , @relList ); $return = 'http://' . $baseHost; $return .= ":$basePort" if $basePort ; $return .= $relURL; return $return; } =pod =section state_message =H return the page state message Returns the English language message of the HTTP state of the page. These are like "OK 200" or "404 Not found" to mention the most well known examples. There are also states that may come from the socket handling process, when retrieval can not get the page. Usage: =verbatim print $page->state_message =noverbatim =cut sub state_message { my $self = shift; my %status_messages = ( -1, "Could not lookup server", -2, "Could not open socket", -3, "Could not bind socket", -4, "Could not connect", -5, "Retrieved file small", -6, "Save file cannot be opened", -7, "Page is too big.", 200, "OK 200", 201, "CREATED 201", 202, "Accepted 202", 203, "Partial Information 203", 204, "No Response 204", 301, "Found, but moved", 302, "Found, but data resides under different URL (add a /)", 303, "Method", 304, "Not Modified", 400, "Bad request", 401, "Unauthorized", 402, "PaymentRequired", 403, "Forbidden", 404, "Not found", 500, "Internal Error", 501, "Not implemented", 502, "Service temporarily overloaded", 503, "Gateway timeout ", 600, "Bad request", 601, "Not implemented", 602, "Connection failed (host not found?)", 603, "Timed out", ); return undef unless defined $self->{'STATE'}; $self->{'STATE_MESSAGE'} = $status_messages{$self->{'STATE'}}; } =pod =section get =H get a page Get a page using http GET. Usage: =verbatim $page->get =noverbatim =cut sub get { my $self = shift; my $interface; my $buffer; return if $self->fetch_state eq 'RETRIEVED'; my $URL = $self->{'URL'}; webpage->log("GET $URL"); for( @proxies ){ my $domain = $_->{'DOMAIN'}; if( $self->url =~ m{$domain} ){ $self->proxy($_->{'PROXY'}); last; } } my $PROXY = $self->{'PROXY'}; # split the URL to (host, port, path) if( $PROXY ){#using proxy $PROXY =~ m{(?:http://)?([\w-\.]+):?(\d*)}; $host = $1; #proxy host $port = $2; #proxy port $path = $URL; #Using proxy the path is the full URL if ($port eq "") { $port = 8080; } }else{#if do not use proxy if ($URL =~ m#^http://([\w-\.]+):?(\d*)(/.*)?#) { $host = $1; #URL host $port = $2; #URL port if( $use_full_url ){ $path = $URL; #send the full path }else{ $path = $3; #URL path } } if ($path eq "") { $path = '/'; } if ($port eq "") { $port = 80; } } $AF_INET = 2; $SOCK_STREAM = 1; $sockaddr = 'S n a4 x8'; if( $host_ip{$host} ){ $thataddr = $host_ip{$host}; }else{ if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))){ $self->{'STATE'} = -1; return; } $host_ip{$host} = $thataddr; $ip_host{ sprintf("%d.%d.%d.%d",ord(substr($thataddr,0,1)), ord(substr($thataddr,1,1)), ord(substr($thataddr,2,1)), ord(substr($thataddr,3,1)) ) } = $host unless $host =~ m{\d+\.\d+\.\d+\.\d+}; } my $thataddr_ip_string = sprintf("%d.%d.%d.%d",ord(substr($thataddr,0,1)), ord(substr($thataddr,1,1)), ord(substr($thataddr,2,1)), ord(substr($thataddr,3,1)) ); $interface = ''; for( @interfaces ){ if( $host =~ m{$interfaces{$_}} || $thataddr_ip_string =~ m{$interfaces{$_}} ){ $interface = $_; last; } } if( ! $interface ){ webpage->log("ERROR no interface defined for $host"); webpage->log("QUITting"); exit; } if( $host_ip{$interface} ){ $thisaddr = $host_ip{$interface}; }else{ if (!(($name,$aliases,$type,$len,$thisaddr) = gethostbyname($interface))){ webpage->log("ERROR can not bind local interface \"$interface\""); webpage->log("QUITting"); $self->{'STATE'} = -1; return; } $host_ip{$interface} = $thisaddr; $ip_host{ sprintf("%d.%d.%d.%d",ord(substr($thisaddr,0,1)), ord(substr($thisaddr,1,1)), ord(substr($thisaddr,2,1)), ord(substr($thisaddr,3,1)) ) } = $interface unless $interface =~ m{\d+\.\d+\.\d+\.\d+}; } ($name,$aliases,$proto) = getprotobyname('tcp'); $this = pack($sockaddr, $AF_INET, 0, $thisaddr); $that = pack($sockaddr, $AF_INET, $port, $thataddr); # Make the socket filehandle. if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))){ $self->{'STATE'} = -2; $self->state_message; return; } # Give the socket an address if (!(bind(S, $this))) { $self->{'STATE'} = -3; $self->state_message; return; } my $time1 = time(); if (!(connect(S,$that))) { $self->{'STATE'} = -4; $self->state_message; return; } select(S); $| = 1; select(STDOUT); my $old_separator = $/; $/ = "\n"; my $time2 = time(); print S "GET $path HTTP/1.0\n"; print S "User-Agent: $USERAGENT\n"; print S "Accept: */*\n"; if( $do_cookies ){ my $ck = $self->list_cookies; if( $ck ){ #if there are any cookies print S "Cookie: ",$ck,"\n"; webpage->log("COOKIE: $ck"); } } my $auth = shift @{$self->{'AUTHLIST'}}; if( defined $self->realm ){ while( 1 ){ last if $self->realm =~ $auth->{'REALM'}; last unless defined($auth = pop @{$self->{'AUTHLIST'}}); } } if( $auth->{'AUTH'} ){ print S 'Authorization: Basic ' . $auth->{'AUTH'} . "\n"; } print S "\n"; binmode S; $response = ; chomp $response; ( $self->{'PROTOCOL'} , $self->{'STATE'} ) = split(/ /, $response); $self->state_message; webpage->log('HEADER State: ' . $self->{'STATE'}); $self->{'CONTENT_LENGTH'} = 0; while(){ chomp; if( $_ eq chr(13) || $_ eq "" ){ last; } while( s/\n$// || s/\r$// ){}# delete all \n and \r from the end # hopefully there are no such characters inside /([^\s\:]+):\s+(.*)/; # split the header line # store the header information in the header hash # if a header has multiple values then the second and the rest # is stored in the header continuation hash array if( defined $self->{'HEADER'}->{lc($1)} ){ push @{$self->{'HEADERc'}->{lc($1)}} , $2; }else{ $self->{'HEADER'}->{lc($1)} = $2; } webpage->log("HEADER $1: $2"); } $self->set_all_cookies if $do_cookies; $self->{'CONTENT'} = ''; # some servers report extraordinary large content-length (a few gigs), then it fails. if( $cl_believe && $pagesizelimit && $self->{'HEADER'}->{'content-length'} && $pagesizelimit < $self->{'HEADER'}->{'content-length'} ){ $self->{'STATE'} = -7; $self->state_message; return; } my $total = 0; for( $i=read(S,$buffer,1024) ; $i > 0 ; $i=read(S,$buffer,1024)){ $total += $i; # count the total downloaded bytes if( $pagesizelimit && $pagesizelimit < $total ){ $self->{'CONTENT'} = ''; # exit if file is too large $self->{'STATE'} = -7; $self->state_message; return; } $self->{'CONTENT'} .= $buffer; } my $time3 = time(); close(S); $self->{'CONTENT_LENGTH'} = length $self->{'CONTENT'}; my $dnrate; if( $time3 - $time2 != 0 ){ $dnrate = length($self->{'CONTENT'}) / ($time3 - $time2) / 128; if( $dnrate =~ /(\d+).(\d*)/ ){ $dnrate = $1 . '.' . ($2 ? substr($2,0,2) : '00'); }else{ $dnrate .= '.00'; } }else{ $dnrate = 'N/A ' } webpage->log('RETRIEVED ' . length($self->{'CONTENT'}) . 'bytes; Connect time=' . ($time2-$time1) . 'sec; Download time=' . ($time3 - $time2) . 'sec; Speed=' . $dnrate . 'Kbps; state=' . $self->{'STATE'}); $/ = $old_separator; $self->fetch_state('RETRIEVED'); if( $self->content_type ne 'text/html' ){ $self->create_file; } return; } =pod =section realm =H Return the realm of a page Returns the realm that the webserver named for authentication when the page was downloaded (or at least there was a trial to download it). As a side effect this method also calculates the R value for the page. Usage: =verbatim $page->realm =noverbatim =cut sub realm { my $self = shift; my $www_authenticate; return $self->{'REALM'} if $self->{'REALM'}; if( defined($www_authenticate=$self->{'HEADER'}->{'www-authenticate'}) ){ if( $www_authenticate =~ /\s*(\w+)\s*realm=(.*)$/ ){ $self->{'AUTH_TYPE'} = lc $1; $self->{'REALM'} = $2; }else{ return undef } }else{ return undef } } =pod =section auth_type =H Return the authentication type of a page Returns the authentication type that the webserver named for authentication when the page was downloaded (or at least there was a trial to download it). This type should be T or we are out of luck. Usage: =verbatim $page->auth_type =noverbatim If this parameter was not extracted yet from the header lines before this method calls R to perform the calculation. =cut sub auth_type{ my $self = shift; $self->realm; return $self->{'AUTH_TYPE'}; } =pod =section file =H Create a file name for an url Return the file name of the page where it is going to be saved or where it was saved. Usage: =verbatim $page->file =noverbatim =cut sub file { my $self = shift; return $self->{'FILE'} if defined $self->{'FILE'}; $self->get; # get it to have the content type # method get calls create file for all non text/html mime types # which means calling this method recursively. So when it # calculates and returns to create_file and then returns to method get # and returns to here this parameter is already calculated... return $self->{'FILE'} if defined $self->{'FILE'}; my $URL = webpage->normalize_url($self->url); my $path; my $port; if( $URL =~ m{^http://([\w-\.]+):?(\d*)(/.*)?} ){ $host = $1; #URL host $port = $2; #URL port $path = $3; #URL path } if ($port eq '') { $port = 80; } if( $path =~ m{^/} ){ $path = substr($path,1) } # convert all characters that are active like ?g=1&h%20=14 $path =~ tr/\?/\//; $path =~ tr/\=/\//; $path =~ tr/\&/\//; # to avoid too long file names $path =~ s/\$/\$24/g; # convert $ to $24 # convert all other character to $xx where xx is hexa ASCII while( $path =~ m{([^\d\w./\$\-\_])} ){ my $orc = $1; my $rep = sprintf("%02X",ord($orc)); $orc = quotemeta $orc; $path =~ s/$orc/\$$rep/g; } $path =~ s{//}{/a/}g; # sometimes it happens that there are two neighbouring characters converted to / my @dlist = split(/\./ , $host); @dlist = reverse( @dlist ); push @dlist , $port; if( $map_method eq 'simple' ){ @dlist = (); } push @dlist , split(/\// ,$path); my $file_name = pop @dlist; push @dlist , $file_name; if( $file_name =~ /\.([^.]*)$/ ){ my $extension = $1; my $pextarr = $mime_extension{$self->content_type}; my $append = '.' . $pextarr->[0]; for( @{$pextarr} ){ my $pat = $_; if( substr($pat,0,1) eq '~' ){ $pat = substr($pat,1); if( lc($extension) =~ /$pat/ ){ $append = ''; last; } }else{ if( lc($extension) eq $_ ){ $append = ''; last; } } } $file_name .= $append; pop @dlist; push @dlist , $file_name; }else{ if( defined($file_name=$self->{'HEADER'}->{'content-location'})){ my @loca = split(/\//,$file_name); $file_name = pop @loca; }else{ if( defined($mime_extension{$self->content_type}) ){ $file_name = 'index.' . $mime_extension{$self->content_type}->[0]; }else{ $file_name = 'index.html'; } # we are practically lost at this point } push @dlist , $file_name; } if( $map_method eq 'flat' ){ $file_name =~ /\.([^.]*)$/; $self->{'FILE'} = $map_counter++ . ".$1"; }else{ $self->{'FILE'} = join('/',@dlist); } } =pod =section file_exists =H Checks file existence Returns true if the file of the page already exists. Usage: =verbatim $page->file_exists =noverbatim This method is not used in current version, but somehow it survived during development. Zap it! =cut sub file_exists { my $self = shift; my $file = $self->cwd . '/' . $self->file; -e $file; } =pod =section create_file =H Create the file for a page Create a new file and save the content of the page into it. Usage: =verbatim $page->create_file =noverbatim =cut sub create_file { my $self = shift; return if $self->fetch_state eq 'SAVED'; my $file = $self->cwd . '/' . $self->file; $file =~ s{\\}{/}g; # convert all \ to / $file =~ s{//}{/}g; #delete double slashes if any remained my @dlist = split( m{/} , $file); pop @dlist; #remove the trailing file name if( $#dlist == -1 ){ return; }#this is a simple file name in the current directory my $cwd = shift @dlist;#take the first subdirectory if( ! -d $cwd ){#if does not exist create it mkdir $cwd,$umask; } for( @dlist ){ $cwd .= "/$_";#take the next subdirectory if( ! -d $cwd ){ mkdir $cwd,$umask; #if does not exist create it } } webpage->log("SAVED $file of type " . $self->content_type); my $FH = $self->{'URL'}; open($FH,">$file") or die "$file can not be opened."; binmode $FH unless $self->content_type =~ /^text/; print $FH $self->{'CONTENT'}; close $FH; $self->fetch_state('SAVED'); $self->destroy; } =pod =section length =H length of the content of a page This method returns the length of the content of a page. The length is set by the method GET after the page was retrieved. This method is needed because older version used T>T<{'CONTENT'}> but that does not work after the page was saved and destroyed. =cut sub length { my $self = shift; return $self->{'CONTENT_LENGTH'}; } =pod =section text_leaf =H Create a leaf text as HTML content Create a text and put it into the content of the page. The content says that the page was not downloaded and can be found here and there. There is also some meta tag that helps the navigation automatically bringing the browser to the original URL. (Refresh after 1 sec.) =cut sub text_leaf { my $self = shift; my $url = $self->url; $self->{'CONTENT'} = < Page not retrieved This page was not retrieved, and can be found at its original location: $url END_HTML } =pod =section base64 =H Encodes argument using base64 encoding This method codes a string to its base64 format as needed by basic authentication. Usage: =verbatim webpage->base64 =noverbatim =cut sub base64{ shift; my $res = ""; pos($_[0]) = 0; # ensure start at the beginning while ($_[0] =~ /(.{1,45})/gs) { $res .= substr(pack('u', $1), 1); chop($res); } $res =~ tr|` -_|AA-Za-z0-9+/|; # `# help emacs # fix padding at the end my $padding = (3 - length($_[0]) % 3) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; $res; } =pod =section text_badstate =H Create a leaf text as HTML content Create a text and put it into the content of the page. The content says that the page was not downloaded because an error and can be found here and there if luck is better now. There is also some meta tag that helps the navigation automatically bringing the browser to the original URL. (Refresh after 1 sec.) =cut sub text_badstate { my $self = shift; my $state_message = $self->state_message; my $status = $self->{'STATE'}; my $proxy = $self->proxy; my $url = $self->url; $self->{'HEADER'}->{'content-type'} = 'text/html'; $self->{'CONTENT'} = < Page not retrieved This page was not retrieved, because the webserver (or the proxy $proxy) returned an error code instead of the resource.

The error code was $status.

The official explanation of this code is:

$state_message

You can try to retrieve the resource right now from $url

Good luck! END_HTML } =pod =section text_redirect =H Create a leaf text as HTML content Create a text and put it into the content of the page. The content says that the page was redirected and points to the file, where the redirected page was saved. There is also some meta tag that helps the navigation automatically bringing the browser to the new page. (Refresh after 1 sec.) =cut sub text_redirect{ my $self = shift; my $file = shift; $self->{'HEADER'}->{'content-type'} = 'text/html'; # it is probably already, but who knows $self->{'CONTENT'} = < Page redirected This page has been moved to $file END_HTML } =pod =section history =H Version History of webmirror pro Webmirror history =itemize =item T<2.05> Directive T now works and is the same as T =item T< > Anchors now seem to work. =item T<2.04> Directive T is introduced and sending full URL for non proxy get also. =item T<2.03> new T and bug fix in T. Previous versions in this sub used the variable T<$page> instead of T<$self> which was a typo. The result was that some pages were not retrieved because the program though they were too deep, altough they were not. =item T<2.02> bug fix : some files were saved twice. The problem was that the second save was done with the destroyed content and it created zero length files. =item T<2.01> cookie handling =item T<2.0> new version with total redesign =item T<1.0> old version =noitemize =cut