#!/usr/bin/perl
# LEGAL WARNING:
#
# This software is provided on an "AS IS", basis,
# without warranty of any kind, including without
# limitation the warranties of merchantability, fitness for
# a particular purpose and non-infringement. The entire
# risk as to the quality and performance of the Software is
# borne by you. Should the Software prove defective, you
# and not the author assume the entire cost of any service
# and repair.
#

#
# The source documentation of this software is maintained by the program esd2html.
# esd2html stands for Embedded Source Documentation to(2) HTML.
# esd2html is GNU GPL Perl program available from
#
#                     http://www.isys.hu/c/verhas/progs/perl/esd2html
#
# To create the online HTML files you also need the SDD file along with this source file.
#
# The original location of this program is
#
#                     http://www.isys.hu/c/verhas/progs/perl/webmirror
#
# If you received this program from a different location visit the URL above to check
# for newer version.
#
# \today
#
=pod
=H mirror.pl

=toc
=cut
#

$version          = '2.45 pre-release';

@include          = (); # domains to include
@exclude          = (); # domains to exclude
$maximal_deepness = 3;  # how long should we follow links
$maximal_size     = 1024*1024; # max 1 MB
$pictures         = 1;  # download pictures
$allpictures      = 0;  # get all pictures even those which are out of the included domain
$allframes        = 0;  # include html pages that are part of a frame even if they are out of the included domain
$picpriority      = 0;  # treat pictures as ordinary page (do not rush for them)
$create_leafs     = 1;  # create redirect files for pages that are too deep
$regexps_are_perl = 0;  # patterns are normal, * is wild car and that is all
$use_full_url     = 1;  # use full URL on GET even without proxy

if( $#ARGV == -1 ){
  &print_help;
  exit;
  }

while( $cmdlinearg = shift @ARGV ){
  if( $cmdlinearg eq '-f' ){
    &process_command_file(shift);  
    }
  elsif( $cmdlinearg eq '-s' ){
    $log_stdout_on = 0;
    }
  elsif( $cmdlinearg eq '-h' ){
    &print_help;
    exit;
    }
  else{
    print STDERR "invalid option $cmdlinearg\n";
    }
  }

$current_size = 0; #downloaded bytes so far

while( $page = webpage->next_page ){
  last if $maximal_size && $current_size >= $maximal_size;
  next if $page->fetch_state eq 'SAVED';

  my $page_level = $page->level;
  if( $maximal_deepness && $page_level > $maximal_deepness ){
    $url = $page->url;
    $page->{'HEADER'}->{'content-type'} = 'text/html';
    $page->file;
    $page->text_leaf;
    $page->create_file if $create_leafs;
    $page->destroy;
    next;
    }

redirected:
  $page->calculate_auth;

authretry:

  $page->get;

  my $status = $page->{'STATE'};

  if( $status == 502 || # Service temporarily overloaded
      $status == 503 || # Gateway timeout
      $status == 603    # Timed out
    ){
    $page->fetch_state('FRESH');
    $page->schedule;
    next;
    }

  if( $status == 401 &&               # failed password
      $page->auth_type eq 'basic' &&  # authentication is basic (we can not do anything else recently)
      defined($page->{'AUTHLIST'}) && # we have username:password list defined
      $#{$page->{'AUTHLIST'}} >= 0 ){ # and have at least one members left to try
    $page->fetch_state('SCHEDULED');  # 'sub get' is lasy and said it was already retrieved,
                                      # but it is not the way like 'sub get' thinks. 'sub get' should try again!
    goto authretry;                   # try again with the next username:password
    # do you know that all listed usernames and password will be tried for
    # each page retrieved in the domain? It does not learn from previous
    # retrievals. It tries and tries all passwords, so do not be crasy and define only
    # one auth for a domain!
    }

  if( ( $status >= 400 && $status <= 404 ) ||
        $status == 501 ||
      ( $status >= 600 && $status <= 602 ) ){
    $page->text_badstate;
    $page->create_file if $create_leafs;
    $page->destroy;
    next;
    }

  if( $status == 301 || $status == 302 ){
    my $URL = $page->{'HEADER'}->{'location'};
    my $file;
    my $redir;
    my $follow_redirect;
    if( $follow_redirect = &can_get( $URL ) ){
      $redir = new webpage($URL);
      webpage->log("REDIRECT to $URL");
      $redir->schedule;
      $file = $page->relate($redir);
      }else{ $file = $URL; }# this is an absolute URL
    $page->text_redirect( $file );
    $page->create_file;
    $page->destroy;
    next unless $follow_redirect;
    $page = $redir;
    goto redirected;
    }

  if( $status >= 200 && $status < 300 ){
    $current_size += $page->length;
    if( $maximal_size && $current_size > $maximal_size ){
      webpage->log('QUOTA retrieving '.$page->url.' process exceeded.');
      webpage->log('QUITting download loop.');
      last; # quit from download loop
      }
    if( $page->content_type eq 'text/html' ){
      $page->split_html;
      delete $page->{'CONTENT'}; # we delete and the rebuild the content
      for $tag ( @{$page->{'SPLIT'}} ){
        next if $tag->{'TYPE'} eq 'TEXT';
        my $tagtype = $tag->{'CONTENT'}->[0];
        my $tags = $tag->{'CONTENT'}->[1];

        #   <A HREF="....." >

        if( ('a' eq $tagtype || 'area' eq $tagtype) && defined($tags->{'href'}) ){
          my $href = $page->urel2abs($tags->{'href'});
          my $hrefa = $href; # url with optional anchor
          $href =~ s/\#([^\/]*)$//; # delete anchor
          my $anchor = $1;
          if( &must_get( $href ) ){
            my $tpage = new webpage($href);
            $tpage->level($page_level + 1);
            $tpage->schedule;
            $tags->{'href'} = $page->relate($tpage); # point to the copied url
            $tags->{'href'} .= '#' . $anchor if $anchor;
            }else{
            $tags->{'href'} = $hrefa; # point absolute to the external, not copied url
            }
          }

        # <META HTTP-EQUIV="Refresh" CONTENT="1; URL=http://....">

        elsif( 'meta' eq $tagtype && lc($tags->{'http-equiv'}) eq 'refresh' ){
          next unless $tags->{'content'} =~ /(\d+)\;\s*url\=(.*)/i;
          my $delay = $1;
          my $href = $page->urel2abs($2);
          my $hrefa = $href; # href with optional anchor
          $href =~ s/\#[^\/]*$//; # delete anchor
          if( &must_get( $href ) ){
            my $tpage = new webpage($hrefa);
            $tpage->level($page_level + 1);
            $tpage->schedule;
            $tags->{'content'} = "$delay; URL=" . $page->relate($tpage); # point to the copied url
            }else{
            $tags->{'content'} = "$delay; URL=$href"; # point absolute to the external, not copied url
            }
          }

        #   <IMG SRC="....." >

        elsif( 'img' eq $tagtype && $pictures && defined($tags->{'src'}) ){
          my $href = $page->urel2abs($tags->{'src'});
          my $hrefa = $href; # href with optional anchor
          $href =~ s/\#[^\/]*$//; # delete anchor
          if( ( $allpictures && &can_get($href) ) || &must_get( $href ) ){
            my $tpage = new webpage($hrefa);
            $tpage->level($page_level);
            $tpage->schedule;
            $tags->{'src'} = $page->relate($tpage);
            }
          }

        #   <FRAME SRC="....." >
        elsif( ('frame' eq $tagtype || 'iframe' eq $tagtype) && defined($tags->{'src'}) ){
          my $href = $page->urel2abs($tags->{'src'});
          my $hrefa = $href; # href with optional anchor
          $href =~ s/\#([^\/]*)$//; # delete anchor
          my $anchor = $1;
          if( ( $allframes && &can_get($href) ) || &must_get( $href ) ){
            my $tpage = new webpage($hrefa);
            $tpage->level($page_level);
            $tpage->schedule;
            $tags->{'src'} = $page->relate($tpage);
            $tags->{'src'} .= '#' . $anchor if $anchor;
            }
          }

        #   <FORM ACTION="....." >
        # we do not follow form actions, but the url should be converted to absolute
        elsif( 'form' eq $tagtype && defined($tags->{'action'}) ){
          my $href = $page->urel2abs($tags->{'action'});
          $tags->{'action'} = $href; # point absolute to the external, not copied url
          }

        }
      $page->rebuild_content;
      }
    $page->create_file;
    $url_state{$page->url} = 'DONE';
    }
  }

exit;

=pod
=section can_get
=H can we get a page?
This function gets an URL and decides if the system can get it or not.

Usage:
=verbatim
               &can_get($url)
=noverbatim

The function returns true if the url can be retrieved using http. If this is a mailto:
ftp://, https:// or other protocol, then it return zero.
=cut
sub can_get {
  my $url = shift;

  return 1 unless $url =~ m{^\w+:}; # relative url is fine
  return 1 if $url =~ m{http://}; # http is fine
  return 0;
  }

=pod
=section must_get
=H should we get a page?
This function gets an URL and decides if the system must get it or not.

Usage:
=verbatim
               &must_get($url)
=noverbatim

The function returns true if the url is to be retrieved by the include and
exclude rules. It does not check page level or maximal retrieval size.

All pages that T<can_get> says can not be got T<must_get> returns zero.
=cut
sub must_get {
  my $url = shift;

  return 0 unless &can_get($url);

  $url = webpage->normalize_url($url);

  my $pattern;
  my $y=0;
  for $pattern ( @include ){
    if( $url =~ /$pattern/ ){
      $y = 1;
      last;
      }
    }

  return 0 unless $y;

  for $pattern ( @exclude ){
    return 0 if $url =~ /$pattern/;
    }
  return 1;
  }

=pod
=section print_help
=H print help on the screen

=cut
sub print_help {
  print <<END__HELP;
WEBMIRROR PRO $version
Usage: 
       webmirror [-s] -f retrieval_definition_file
or
       webmirror [-s] -f STDIN

to read the retrieval definition from the standard input.

Use -s to supress STDOUT logging.

For further information see the product documentation at
                http://www.isys.hu/c/verhas6progs/perl/webmirror
END__HELP
  }

=pod
=section process_command_file
=H Processing rdf file

This function processes a retrieval definition file.

=cut
sub process_command_file {
  my $command_file = shift;
  my $lines;

  if( $command_file eq 'STDIN' ){
    my $oldsep = $/; undef $/;
    $lines = <STDIN>;
    $/ = $oldsep;
    close STDIN; # no reason to keep it open
    }else{
    open(F,"<$command_file") or die "Can not open command file $command_file";
    my $oldsep = $/; undef $/;
    $lines = <F>;
    $/ = $oldsep;
    close F;
    }
  while( $lines =~ /^(config\s+.*)$/m ){
    my $includef = $1;
    $include = quotemeta $includef;
    $includef =~ s/^config\s*//;
    open(F,"<$includef") or die "Can not open command file $includef";
    my $oldsep = $/; undef $/;
    my $inclines = <F>;
    $/ = $oldsep;
    close F;
    $lines =~ s/$include/$inclines/mg;
    }
  my @lines = split(/\n+/ , $lines);
  $lines = '';
  for( @lines ){
    my $command;
    my $parameter;

    chomp;
    s/^\s*//;s/\s*$//; # delete leading and trailing spaces
    next if /^\s*$/;# ignore empty lines
    next if /^\s*\#/; #ignore comment lines

    if( /(\w+)\s+(.*)/ ){
      $command = $1;
      $parameter = $2;
      }
    elsif( /(\w+)/ ){
      $command = $1;
      $parameter = '';
      }
    else{
      print STDERR "line '$_' ignored. Syntax error.\n";
      next;
      }

    if( $command eq 'log' ){
      $webpage::log_file = $parameter;
      next;
      }

    if( $command eq 'map' ){
      $webpage::map_method = $parameter;
      next;
      }

    if( $command eq 'start' ){
      $parameter = 'http://' . $parameter unless $parameter =~ m{^http://};
      my $page = new webpage( $parameter );
      $page->schedule;
      $page->level(1);
      next;
      }

    if( $command eq 'include' ){
      $parameter = 'http://' . $parameter unless $parameter =~ m{^http://};
      $parameter = &regexp_it($parameter);
      push @include , $parameter;
      next;
      }

    if( $command eq 'regexp' ){
      if( $parameter eq 'perl' ){
        $regexps_are_perl = 1;
        next;
        }elsif( $parameter eq 'normal' ){
        $regexps_are_perl = 0;
        next;
        }
      }

    if( $command eq 'exclude' ){
      $parameter = 'http://' . $parameter unless $parameter =~ m{^http://};
      $parameter = &regexp_it($parameter);
      push @exclude , $parameter;
      next;
      }

    if( $command eq 'level' ){
      if( $parameter eq 'inf' ){
        $maximal_deepness = 0;
        }else{
        $maximal_deepness = $parameter;
        }
      next;
      }

    if( $command eq 'directget' ){
      if( $parameter eq 'partial' ){
        $use_full_url = 0;
        }
      if( $parameter eq 'full' ){
        $use_full_url = 1;
        }
      next;
      }

    if( $command eq 'size' ){
      if( $parameter eq 'inf' ){
        $maximal_size = 0;
        next;
        }
      $maximal_size = &prep_postf($parameter);
      next;
      }

    if( $command eq 'directory' ){
      webpage->cwd($parameter);
      next;
      }

    if( $command eq 'allpictures' ){
      $allpictures = 1;
      next;
      }

    if( $command eq 'allframes' ){
      $allframes = 1;
      next;
      }

    # try this first without authentication
    # noauth http://*.mydomain.com
    if( $command eq 'noauth' ){
      $parameter = &regexp_it($parameter);
      push @webpage::noauth_domain , { 'DOMAIN' => $domain ,
                                       'REALM'  => '.*' ,
                                       'AUTHS'  => '' , # this means w/o authentication
                                      };
      next;
      }

    # auth http://www.mydomain.com/* realm username:password
    if( $command eq 'auth' ){
      $parameter =~ /(\S+)\s*(.*)$/;
      my $domain = &regexp_it($1);
      $parameter = $2;
      $parameter =~ /(\S+)\s*(.*)$/;
      my $realm  = &regexp_it($1);
      $parameter = webpage->base64($2);
      push @webpage::auth_domain , { 'DOMAIN' => $domain ,
                                     'REALM'  => $realm ,
                                     'AUTH'  => $parameter ,
                                    };
      next;
      }

    if( $command eq 'pictures' ){
      $pictures = 1;
      if( $parameter eq 'first' ){
        $picpriority = 1; # schedule img url to the start of the schedule list
        }else{
        $picpriority = 0; # schedule img url to the end of the schedule list
        }                 # just like html pages
      next;
      }

    if( $command eq 'nopictures' ){
      $pictures = 0;
      $picpriority = 0; # useless but set it to default
      next;
      }

    if( $command eq 'leafs' ){
      $create_leafs = 1;
      next;
      }

    if( $command eq 'noleafs' ){
      $create_leafs = 0;
      next;
      }

    if( $command eq 'proxy' ){
      if( $parameter =~ /(.+)\s+(.+)/ ){
        my $proxy = $1;
        my $domain = $2;
        $domain = &regexp_it($domain);
        webpage->define_proxy( $proxy , $domain );
        }else{
        webpage->define_proxy( $proxy );
        }
      next;
      }

    if( $command eq 'interface' ){
      if( $parameter =~ /(.+)\s+(.+)/ ){
        my $if = $1;
        my $pattern =  &regexp_it($2);
        webpage->define_interface( $if , $pattern );
        }else{
        webpage->define_interface( $parameter , '.*' );
        }
      next;
      }

    if( $command eq 'pagesizelimit' ){
      if( $parameter eq 'inf' ){
        $webpage::pagesizelimit = 0;
        next;
        }
      $webpage::pagesizelimit = &prep_postf($parameter);
      next;
      }

    if( $command eq 'unbelieve' ){
      if( lc($parameter) eq 'content-length' ){
        $webpage::cl_believe = 0;
        next;
        }
      }
    if( $command eq 'believe' ){
      if( lc($parameter) eq 'content-length' ){
        $webpage::cl_believe = 1;
        next;
        }
      }

    if( $command eq 'agent' ){
      $webpage::USERAGENT = $parameter;
      next;
      }

    if( $command eq 'cookie' || $command eq 'cookies' ){
      if( $parameter eq 'no' ){
        $webpage::do_cookies = 0;
        next;
        }
      if( $parameter eq 'yes' ){
        $webpage::do_cookies = 1;
        next;
        }
      if( $parameter eq '3dots' ){
        $webpage::count_cookie_dots = 2;
        next;
        }
      if( $parameter eq 'dots' ){
        $webpage::count_cookie_dots = 1;
        next;
        }
      if( $parameter eq 'nodots' ){
        $webpage::count_cookie_dots = 0;
        next;
        }
      if( $parameter eq 'domain' ){
        $webpage::care_cookie_domain = 1;
        next;
        }
      if( $parameter eq 'nodomain' ){
        $webpage::care_cookie_domain = 0;
        next;
        }
      if( $parameter =~ /save\s+session/ ){
        $webpage::save_session_cookies = 1;
        next;
        }
      if( $parameter =~ /don\'?t\s+save\s+session/ ){
        $webpage::save_session_cookies = 0;
        next;
        }
      if( $parameter =~ /backup\s+(\d+)/ ){
        $webpage::cookie_file_backup_nr = $1;
        next;
        }
      if( $parameter =~ /file\s+(.*)/ ){
        $webpage::cookie_file = $1;
        webpage->load_cookies($webpage::cookie_file);
        next;
        }
      if( $parameter =~ /load\s+(.*)/ ){
        webpage->load_cookies($1);
        next;
        }
      }

    print STDERR "line '$_' ignored\n";
    }
  }

=pod
=section prep_postf
=H Prepare a size parameter

Usage:
=verbatim
          $p = &prep_postf($p);
=noverbatim

Checks if there is any T<K> or T<M> postfix after the size parameter,
and converts the numeric value. As an example it returns T<1024> for
T<1K>, or T<1024*1024> for T<1M>.

=cut
sub prep_postf {
  my $p = shift;

  if( $p =~ s/M$// ){ $p *= 1024*1024 }
  elsif( $p =~ s/K$// ){ $p *= 1024 }
  return $p;
  }

=pod
=section regexp_it
=H Converts a pattern to a regular expression

Usage:
=verbatim
             $regexp = &amp;regexp_it("wildchar pattern*");
=noverbatim

Domains in the config file can be given as wild charactered strings.
This routine converts these strings to Perl regular expression.

This is nothing else then T<quotemeta> all the string and convert the
wild character T<*> toT<.*>

Some user may like to specify Perl regular expressions saying

=verbatim
             regexp perl
=noverbatim

in the RDF file. The effect of that command is that the global variable
T<$regexps_are_perl> is set. When that variable is true this function
becomes transparent and returns the strings as they came is.

=cut
sub regexp_it {
  my $string = shift;

  return $string if $regexps_are_perl;

  $string = quotemeta $string;
  $string =~ s/\\\*/.*/g; #make * the joker character
  return $string;
  }

=pod
=section webpage
=H PACKAGE WEBPAGE
=abstract
This package/class implements the web page functionality. A web page is stored in the memory,
and is usually prepared for download, downloaded, prepared to be saved and saved.
=end
This package/class implements the web page functionality. A web page is stored in the memory,
and is usually prepared for download, downloaded, prepared to be saved and saved.

A T<webpage> object is a page which is downloaded from a URL. One page object has one URL and
one URL is assigned to a page object. Sometimes the same URL returns different pages, but
somewhere implementation has to stop.

There are many things that the program can do with such an objects. See the methods.

The only thing which is not really object oriented is the cookie handling. Cookies are
stored in the array T<@cookies> which belong to the package T<webpage>. But you can not
create different cookie realms for different downloads. There is only a single
T<@cookies> array.
=cut
package webpage;

BEGIN {
  $umask = 0777;
  $USERAGENT = 'my useragent 1.0';

  $map_method = 'simple'; # by default we do not create sub directories like com/digital/www/80/...
  $map_counter = 1; # file name counter when mapping is 'flat'
  $object_hash = {}; # containing all objects created
  @schedule_list = ();

  %host_ip = (); # store the ip addresses for hosts already accessed
  %ip_host = (); # and store the reverse

  @proxies          = (); # proxies to use
  $save_directory   = '.';# where to store the result
  $log_file         = ''; # where to write the log
  $log_opened       = 'NOT YET';
  $log_stderr_on    = 0;  # we do not send log messages to stderr by default
  $log_stdout_on    = 1;  # we        send log messages to stdout by default

  $pagesizelimit    = 0;  # no page size limit by default
  $cl_believe       = 1;  # do believe reported content length
  @auth_domain      = (); # list of domains that authentication was defined
  @noauth_domain    = (); # list of domains that should be first tried w/o authentication

  $interfaces_defined = 0;
  my $hostname = `hostname`;
  chomp $hostname;
  @interfaces = ($hostname);
  %interfaces = ( $hostname => '.*' );

  @mdays   = ( '31', '28', '31', '30', '31', '30', '31', '31', '30', '31', '30', '31' );

  $do_cookies = 1; # process cookies

  @cookies = (); # each item is a pointer to a hash of { NAME => ?? , VALUE => ??,
                 #                                       EXPIRES => ??, PATH => ?? ,
                 #                                       DOMAIN => ??, SECURE => 0|1 }

  $count_cookie_dots = 1; # 0 = do not care how many dots there are
                          # 1 = request at least two dots (default)
                          # 2 = request two or three dots (according to the spec, but it is crasy! *.mydom.hu ??)

  $care_cookie_domain = 1;# check that the indicated domain fits the sender (i.e. www.cnn.com should not
                          # send a cookie with the domain .bbc.com)

  $save_session_cookies = 0; # do not save the cookies that have no expiration time

  $cookie_file_backup_nr = 100; # how many backup files to keep from old cookie files with names:
                                # cookie.txt.00 ,cookie.txt.01 ,cookie.txt.02 ,cookie.txt.03 ...


  $cookie_file = ''; # where to save the cookies

  %mime_extension = (
'application/x-gzip' => ['gz'],
'application/x-compress' => ['Z'],
'applicarion/x-ns-proxy-autoconfig' => ['pac'],
'application/x-javascript' => ['js','ls','mocha'],
'application/x-tcl' => ['tcl'],
'application/x-sh' => ['sh'],
'application/x-csh' => ['csh'],
'application/postscript' => ['ai','eps','ps'],
'application/octet-stream' => ['exe','~.*'], # match all extensions
'application/x-cpio' => ['cpio'],
'application/x-gtar' => ['gtar'],
'application/x-tar' => ['tar'],
'application/x-shar' => ['shar'],
'application/x-zip-compressed' => ['zip'],
'application/x-stuffit' => ['sit'],
'application/mac-binhex40' => ['hqx'],
'video/x-msvideo' => ['avi'],
'video/quicktime' => ['qt','mov'],
'video/mpeg' => ['mpeg','mpg','mpe'],
'audio/x-wav' => ['wav'],
'audio/x-aiff' => ['aif','aiff','aifc'],
'audio/basic' => ['au','snd'],
'application/fractals' => ['fif'],
'image/ief' => ['ief'],
'image/x-MS-bmp' => ['bmp'],
'image/x-rgb' => ['rgb'],
'image/x-portable-pixmap' => ['ppm'],
'image/x-portable-graymap' => ['pgm'],
'image/x-portable-bitmap' => ['pbm'],
'image/x-portable-anymap' => ['pnm'],
'image/x-xwindowdump' => ['xwd'],
'image/x-xpixmap' => ['xpm'],
'image/x-xbitmap' => ['xbm'],
'image/x-cmu-raster' => ['ras'],
'image/tiff' => ['tiff','tif'],
'image/jpeg' => ['jpeg','jpg','jpe'],
'image/gif' => ['gif'],
'application/x-fexinfo' => ['texi'],
'application/x-fexinfo' => ['texinfo'],
'application/x-dvi' => ['dvi'],
'application/x-latex' => ['latex'],
'application/x-tex' => ['tex'],
'application/rtf' => ['rtf'],
'text/html' => ['html','htm'],
'text/plain' => ['txt','text'],
   );

  }

END {
  if( $cookie_file ){
    webpage->save_cookies($cookie_file);
    }
  }

=pod
=section calculate_auth
=H Calculate the authentication string to be used for a page

Usage:
=verbatim
                $page->calcuate_auth
=noverbatim

A page can be retrieven with or without authentication string. The RDF file
defines how to retrieve the pages. This method examines the url of the page
and creates an array containing all the authentication information that
should be tried to download the page.

=cut
sub calculate_auth {
  my $self = shift;

  return if defined $self->{'AUTHLIST'};

  for( @noauth_domain ){
    if( $self->url =~ /$_->{'DOMAIN'}/ ){
      $self->{'AUTHLIST'} = [ { 'REALM' => '.*' , 'AUTH' => '' } ];
      last;# if one mathces it is enough. we won't try several times w/o password only once
      }
    }

  $self->{'AUTHLIST'} = [] unless defined $self->{'AUTHLIST'};


  for( @auth_domain ){
    if( $self->url =~ /$_->{'DOMAIN'}/ ){
      push @{$self->{'AUTHLIST'}} , { 'REALM' => $_->{'REALM'} , 'AUTH' => $_->{'AUTH'} };
      }
    }
  }

=pod
=section new
=H create a new page object

This method either creates a new object for the url given
as parameter or returns the reference to the object if an object
was already created for the url.

Usage:
=verbatim
           $page = new webpage ($url);
=noverbatim

=cut
sub new {
  my $class = shift;
  my $url = shift;

  $url =~ s/\#[^\/]*$//; # delete anchor if it exists (page never has an anchor)

  if( $object_hash{$url} ){# return the object if it was already created
    return $object_hash{$url};
    }
  my $self = {};
  bless $self,$class;
  $url = 'http://' . $url unless $url =~ m{^http://};
  $self->{'URL'} = $url;
  $self->fetch_state('FRESH');
  $object_hash{$url} = $self;
  return $self;
  }

=pod
=section log
=H Issue a log intem into the log file

This function creates an item into the log file.

Usage:
=verbatim
           webpage->log('log message');
=noverbatim

=cut
sub log {
  shift;
  my $text = shift;
  my $now = time();
  my ($sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst) = localtime($now);
  $year +=1900;
  $mon++;
  $mon  = '0' . $mon  if $mon  < 10;
  $mday = '0' . $mday if $mday < 10;
  $sec  = '0' . $sec  if $sec  < 10;
  $hour = '0' . $hour if $hour < 10;
  $min  = '0' . $min  if $min  < 10;

  $text = "$year.$mon.$mday $hour:$min:$sec $text";
  print STDERR "$text\n" if $log_stderr_on;
  print STDOUT "$text\n" if $log_stdout_on;
  return unless $log_file;
  return if $log_opened eq 'FAILED';
  my $fom = '>';

  $fom .= '>' unless $log_opened eq 'NOT YET';

  if( ! open(LOG,"$fom$log_file") ){
    $log_opened = 'FAILED';
    print STDERR "Log file $log_file can not be opened.\n";
    return;
    }else{ $log_opened = 'YES' }

  print LOG "$text\n";
  close LOG;
  
  return;
  }

=pod
=section define_proxy
=H Define a new proxy server

This class method is called during configuration. This
simpply defined a new proxy server to be used for a certain
domain pattern.

Usage:

=verbatim
              webpage->define_proxy("http://proxy.myc.com" , "http://mydomain*" )
=noverbatim
=cut
sub define_proxy {
  shift; # class method
  my $proxy = shift;
  my $domain = shift;

  $domain = '.*' unless defined $domain;

  push @proxies , { 'PROXY' => $proxy , 'DOMAIN' => $domain };
  }

=pod
=section save_cookies
=H Save all the cookies to a cookies file

Usage:
=verbatim
            webpage->save_cookies("fileName");
=noverbatim

This class method saves the content of the global array T<@cookies> into
the file T<fileName> given as argument.

Each line of the file will contain one cookie in the form:
=verbatim
NAME=VALUE; expires=DATE;path=PATH; domain=DOMAIN_NAME; secure
=noverbatim

=cut
sub save_cookies {
  shift; #class
  my $file = shift;

  # create backup files
  if( $cookie_file_backup_nr && -e $file ){
    my $i;
    for $i ( 0 ... $cookie_file_backup_nr-1 ){
      $i = ('0' x (length($cookie_file_backup_nr-1)-length($i)) ) . $i;
      if( ! -e "$file.$i" ){
        if( rename $file , "$file.$i" ){
          $i++;
          $i = 0 if $i == $cookie_file_backup_nr;
          $i = ('0' x (length($cookie_file_backup_nr-1)-length($i)) ) . $i;
          unlink "$file.$i";
          last;
          }
        }
      }
    }

  my $timeNow = time(); # expired cookies are not saved

  if( ! open($file,">$file") ){
    webpage->log("ERROR The cookie file $file was not saved.");
    return;
    }
  for $this_cookie ( @cookies ){
    if(  ( $save_session_cookies && $this_cookie->{'EXPIRES_CONVERTED'} == -1 ) || 
         $this_cookie->{'EXPIRES_CONVERTED'} > $timeNow ){ # not expired yet
      print $file $this_cookie->{'NAME'},'=',$this_cookie->{'VALUE'};
      if( $this_cookie->{'EXPIRES_CONVERTED'} != -1 ){
        print $file '; expires=',$this_cookie->{'EXPIRES'};
        }
      print $file '; path=',$this_cookie->{'PATH'};
      print $file '; domain=',$this_cookie->{'DOMAIN'};
      print $file '; secure' if $this_cookie->{'SECURE'};
      print $file "\n";
      }
    }
  close $file;
  }
=pod
=section load_cookies
=H Load all the cookies from the cookies file that was created calling save_cookies

Usage:
=verbatim
            webpage->load_cookies("fileName");
=noverbatim

This class method loads the content of the global array T<@cookies> from
the file T<fileName> given as argument.

The old content of the array is not deleted, so cookie files can be merged.

This method actually switches off cookie domain compliancy check and calls the
method T<set_cookie> using a dummy object for each line of the cookie file.
=cut
sub load_cookies {
  shift; #class
  my $file = shift;

  if( ! open($file,"<$file") ){
    webpage->log("ERROR the cookie file $file was not loaded.");
    return;
    }
  my $my_care_cookie_domain = $care_cookie_domain;
  my $my_count_cookie_dots = $count_cookie_dots;

  my $dummy = webpage->new('http://www.isys.hu/c/verhas'); # it is a dummy, it could be any url

  while( <$file> ){
    chomp;
    $dummy->set_cookie($_);
    }
  close $file;
  my $care_cookie_domain = $my_care_cookie_domain;
  my $count_cookie_dots = $my_count_cookie_dots;
  }

=pod
=section list_cookies
=H List all cookies that are to be sent with the http request

Usage:
=verbatim
             print "Cookie: ",$page->list_cookies,"\n";
=noverbatim

This method checks the global array T<@cookies> and creates a string
of the format
=verbatim
NAME1=OPAQUE_STRING1; NAME2=OPAQUE_STRING2 ...
=noverbatim
including all the cookies that fit the URL domain, path and is not expired yet.

=cut
sub list_cookies {
  my $self = shift;

  my $i,$j; # index variables for sorting cookies

  my $domain = $self->host;
  my $path   = $self->path;
  my $timeNow = time(); # expired cookies are not listed

  my $this_cookie;
  my @cookie_list = ();
  for $this_cookie ( @cookies ){
    my $domainpattern = '.*' . quotemeta($this_cookie->{'DOMAIN'}) . '$';
    my $pathpattern   = quotemeta($this_cookie->{'PATH'}) . '.*$'; 
    if(  $domain =~ /$domainpattern/ && $path =~ /$pathpattern/ && # domain and path is OK
         ($this_cookie->{'EXPIRES_CONVERTED'} == -1 || # there is no expiration defined
         $this_cookie->{'EXPIRES_CONVERTED'} > $timeNow ) ){ # or not expired yet
      push @cookie_list , $this_cookie;
      }
    }

  # simple buble sort to order the cookies so that the more
  # precise path comes first as it is required by the spec.
  for $i ( 0 .. $#cookie_list-1 ){
    for $j ( $i+1 .. $#cookie_list ){
      if( length($cookie_list[$i]->{'PATH'}) < length($cookie_list[$j]->{'PATH'}) ){
        my $swap = $cookie_list[$i];
        $cookie_list[$i] = $cookie_list[$j];
        $cookie_list[$j] = $swap;
        }
      }
    }

  my $list = '';
  for( @cookie_list ){
    $list .= '; ' if $list;
    $list .= $_->{'NAME'} . '=' . $_->{'VALUE'};
    }
  return $list;
  }

=pod
=section set_all_cookies
=H Set all cookies of a retrieved page

Usage:
=verbatim
           $page->set_all_cookies;
=noverbatim

This method calls the method R<set_cookie> method for each
cookie sent with the page and stores the cookies sent in the
global T<@cookies> array.

=cut
sub set_all_cookies {
  my $self = shift;

  if( defined $self->{'HEADER'}->{'set-cookie'} ){
    $self->set_cookie($self->{'HEADER'}->{'set-cookie'});
    }

  if( defined($self->{'HEADERc'}->{'set-cookie'}) &&
      $#{$self->{'HEADERc'}->{'set-cookie'}} > -1 ){
    my $thisccokie;
    for $thiscookie ( @{$self->{'HEADERc'}->{'set-cookie'}} ){
      $self->set_cookie($thiscookie);
      }
    }

  }

=pod
=section set_cookie
=H Set a new cookie or update an old one

Usage:
=verbatim
           $page->set_cookie("cookie string with options")
=noverbatim

This method sets a new cookie and modifies the T<@cookies>
global array.
=cut
sub set_cookie {
  my $self = shift;
  my $cookie_string = shift; # NAME=VALUE; expires=DATE;path=PATH; domain=DOMAIN_NAME; secure

  my ($expires,$path,$domain,$secure);
  my $i; # loop index variable going through the cookie array

  $cookie_string =~ s/\s*(.*?)\s*;//;
  my $name_value = $1;
  my ($name,$value) = split(/=/ , $name_value);
  my @params = split(/;/,$cookie_string);
  for( @params ){
    s/^\s*//; s/\s*$//;
    if( /^secure$/i ){
      $secure = 1;
      next;
      }
    if( /expires\s*=\s*(.*)/i ){
      $expires = $1;
      next;
      }
    if( /path\s*=\s*(.*)/i ){
      $path = $1;
      next;
      }
    if( /domain\s*=\s*(.*)/i ){
      $domain = $1;
      next;
      }
    }

  $path    = $self->path unless defined $path;    # path that the cookie comes from
  $secure  = 0           unless defined $secure;  # it is not secure by default
  if( defined $expires ){
    $expires_converted = webpage->cookieT2time($expires);
    }else{  $expires_converted =-1 } # expires by the end of session


  if( defined( $domain ) ){
    if( $care_cookie_domain ){
      my $domainpattern = '.*' . quotemeta($domain) . '$' ;
      if( $self->host !~ /$domainpattern/ ){
        webpage->log("ERROR cookie domain $domain has not fit sending url ", $self->url);
        return;
        }
      }
    }else{
    # default is the machine that sends the cookie and it obviously fits itself
    $domain  = $self->host;
    }

  if( $count_cookie_dots ){
    # calculate the number of dots in the domain parameter
    # the spec requires at least two or three
    my $dot_nr = length( $domain =~ /\./g );
    if( $count_cookie_dots == 2 ||
        $domain =~ /\.gov/i || 
        $domain =~ /\.edu/i || 
        $domain =~ /\.net/i || 
        $domain =~ /\.org/i || 
        $domain =~ /\.com/i || 
        $domain =~ /\.mil/i || 
        $domain =~ /\.int/i   ){
       $dot_nr -= 2;
       }else{ $dot_nr -= 3 }
    if( $dot_nr < 0 ){
      webpage->log("ERROR cookie domain $domain has not enough dots in it.");
      return;
      }
    }

  for $i ( 0 .. $#cookies ){

    if( $cookies[$i]->{'NAME'} eq $name && 
        $cookies[$i]->{'DOMAIN'} eq $domain && 
        $cookies[$i]->{'PATH'} eq $path ){
      $cookies[$i]->{'VALUE'} = $value;
      $cookies[$i]->{'EXPIRES'} = $expires;
      $cookies[$i]->{'EXPIRES_CONVERTED'} = $expires_converted;
      $cookies[$i]->{'SECURE'} = $secure;  # later version may support HTTPS
      return; # the old cookie is stored with the new value
      }
    }

  push @cookies , {
    'DOMAIN' => $domain,
    'PATH' => $path,
    'NAME' => $name,
    'VALUE' => $value,
    'EXPIRES' => $expires,
    'EXPIRES_CONVERTED' => $expires_converted ,
    'SECURE' => $secure # later version may support HTTPS
    };
  return; # the cookie was stored as a new value
  }

=pod
=section cookieT2time
=H Convert cookie expiration time to GMT time seconds

Usage:
=verbatim
             webpage->cookieT2time("cookie expiration string")
=noverbatim

Convert a time value given in the cookie to seconds as given by the
system function T<time()>. Expiration time in the cookies are 
given in the format T<Wdy, DD-Mon-YY HH:MM:SS GMT>

We do not care the T<Wdy> part as this is redundant, and do not care the
T<GMT> as that should always be T<GMT> and no other timezone is permitted.

This class method examines the string, splits it into parts and calculates the
seconds between January 1, 1970. and the time given by the string.

This is a class method.
=cut
sub cookieT2time {
  shift; #class
  my $cookie_time = shift; # of the format Wdy, DD-Mon-YY HH:MM:SS GMT

  # convert the string into separate values
  $cookie_time =~ /.*\,\s*(\d+)\-(\w+)\-(\d+)\s*(\d+):(\d+):(\d+)/i;
  my ($mday,$moname,$year,$hour,$min,$sec) = ($1,$2,$3,$4,$5,$6);

  # the stupid cookie time format is not Y2K
  # we do our best works from 1998 till 2098 and when year is given as YYYY
  if( length($year) == 2 ){
    if( $year eq '98' || $year eq '99' ){
       $year += 1900;
       }else{
       $year += 2000;
       }
    }# now we have a four digit year

  # get the serial number of the month
  $month = { 'jan' =>  0 , 'feb' =>  1 , 'mar' =>  2 , 'apr' =>  3 ,
             'may' =>  4 , 'jun' =>  5 , 'jul' =>  6 , 'aug' =>  7 ,
             'sep' =>  8 , 'oct' =>  9 , 'nov' => 10 , 'dec' => 11
           } -> { lc $moname };

  # count the days since the epoch (January 1, 1970.)
  my $ds = 0;
  for(1970 .. $year ){
    $ds += ((($_ % 4) == 0) && ( ($_ % 100) || (($_ % 400)==0) )) ? 366 : 365
    }

  # summ up the day of the months until the current
  for ( 0 .. $month-1 ){ $ds += $mdays[$_] ; }

  # add one if this is a leap year and we are after february
  if( ((($_ % 4) == 0) && ( ($_ % 100) || (($_ % 400)==0) )) && $month >1 ){ $ds++ }

  # summ up the days of the current months
  $ds += $day;

  # now convert all this stuff to seconds
  $ds *= 86400;

  # add hour, minute and seconds values
  $ds += $sec + 60*($min + 60*$hour);

  return $ds;
  }

=pod
=section normalize_url
=H Normalize a url

Get a url as an argument and check if an ip number is given instead of
a host name. If the ip number is specified then replace it with the host name
and return the resulting url.

This is a class method.

Usage:
=verbatim
                $url = webpage->normalize_url($url);
=noverbatim

Sometimes getting a page using symbolic host name the webserver answers a redirect to
an URL which is the same as the requested but having the machine ip number instead
of the symbolic name (reason?). If we used the the ip number instead of the symbolic
name we could easily get out of the download domain that the user usually defines
using the symbolic names. Therefore this method is called whenever the page is
going to be saved and whenever the url is check if it is in the domain or not.

This method is also used to decide if a cookie is to be sent to the server.
=cut
sub normalize_url{
  shift;
  my $url = shift;

  if( $url =~ m{^http://([\w-\.]+):?(\d*)(/.*)?} ){
    my $host = $1; #URL host
    my $port = $2; #URL port
    my $path = $3; #URL path
    if( $host =~ m{\d+\.\d+\.\d+\.\d+} ){ # this is an ip number
      $host = $ip_host{$host} if defined $ip_host{$host};
      my $ourl = $url;
      $url = 'http://' . $host;
      $url .= ':' . $port if $port;
      $url .= $path if $path;
      webpage->log("NORMALIZED $ourl to $url");
      return $url;
      }else{
      return $url;
      }
    }else{
    return $url;
    }
  }

=pod
=section host
=H Return the host of a page

Calculate the host of a page. This method is used in cookie handling. This
method uses the URL of a page and extracts the host from it. If this value
is numeric (like 193.68.44.152) it tries to see if this host is already
known by its name (calls the method T<normalize_url>).

As a side effect this method also calculates the R<path> value of a page.

Usage:
=verbatim
              $page->host;
=noverbatim
=cut
sub host {
  my $self = shift;

  if( ! defined $self->{'HOST'} ){
    webpage->normalize_url($self->url) =~ m{^http://([\w-\.]+):?(\d*)(/.*)?};
    $self->{'HOST'} = $1; #URL host
    $self->{'PATH'} = $3;
    if( $self->{'PATH'} =~ /\.(\d\w)+$/ ){ # path ends with file name
      $self->{'PATH'} =~ s{/[^/]*$}{/};    # delete the file name
      }else{
      $self->{'PATH'} .= '/' unless $self->{'PATH'} =~ m{/$};
      }
    }
  return $self->{'HOST'};
  }

=pod
=section path
=H Return the path of a page

Calculate the path of a page. This method is used in cookie handling. This
method uses the URL of a page and extracts the path from it.

Usage:
=verbatim
              $page->path;
=noverbatim

If the path value was not calculated before this method invokes the method R<host>
to calculate the host and the path values as the same time.
=cut
sub path {
  my $self = shift;

  if( ! defined $self->{'PATH'} ){
    $self->host; # calculates also PATH
    }
  return $self->{'PATH'};
  }

=pod
=section url
=H Return the url of a page

This is just a stub to get the url of the page.

Usage:
=verbatim
              $page->url;
=noverbatim
=cut
sub url {
  my $self = shift;

  return $self->{'URL'};
  }

=pod
=section define_interface
=H Define a new interface

This class method is called during configuration. This method defines
a new interface which should be used during download. The parameters of the
method define the IP name of the interface and a pattern for which the
interface is to be used.

Usage:
=verbatim
              webpage->define_interface($interface,$pattern);
=noverbatim
=cut
sub define_interface {
  shift; # class method
  my $if = shift;
  my $pattern = shift;

  if( ! $interfaces_defined ){
    @interfaces = ();
    %interfaces = ( );
    $interfaces_defined = 1;
    }
  push @interfaces , $if;
  $interfaces{$if} = $pattern;
  }

=pod
=section fetch_state
=H Set or get the fetch_state of a page

Usage:
=verbatim
           $page->fetch_state

or
           $page->fetch_state($state)
=noverbatim

During the retrieval process a page can have the follwing states:

=itemize
=item T<FRESH> the page object was generated, but nothing has beed done to the page
=item T<SCHEDULED> the page object was scheduled for retrieval
=item T<RETRIEVED> the page object was retrieved sucessfully or unsuccessfully via a
http GET method
=item T<SAVED> the page was saved to the disk and the content was destroyed (see R<destroy>).
=noitemize

=cut
sub fetch_state {
  my $self = shift;
  my $state = shift;

  $self->{'FETCH_STATE'} = $state if defined $state;
  return $self->{'FETCH_STATE'};
  }

=pod
=section content_type
=H Return the content type header information

Usage:
=verbatim
               $page->content_type
=noverbatim

This method actually returns a stripped content type that does not
include character code page or anything alike, which can stand after
a semicolon.
=cut
sub content_type {
  my $self = shift;

  if( ! defined $self->{'STRIPPED_CONTENT_TYPE'} ){
    $self->{'STRIPPED_CONTENT_TYPE'} = $self->{'HEADER'}->{'content-type'};
    $self->{'STRIPPED_CONTENT_TYPE'} =~ s/\s*;.*//;
    }
  return $self->{'STRIPPED_CONTENT_TYPE'};
  }

=pod
=section level
=H set or get the level of a page

Each page has a level which measures the distance of the page
from the starting page(s). The starting pages have level one. Each page
referenced by a page having level N has level N+1.

This method sets or retrieves the level of a page. If the level that
the argument specifies is higher than the level that the page already has
then the higher value is abandoned, because there is a shorter way
getting to this page from the start pages.

If the page has a level and the new level is smaller then the new level is
used.

Usage:
=verbatim
              $page->level
or
              $page->level($level)
=noverbatim

=cut
sub level{
  my $self = shift;
  my $level = shift;

  $self->{'LEVEL'} = $level
    if defined($level) && (!defined($self->{'LEVEL'}) || $self->{'LEVEL'} > $level);

  return $self->{'LEVEL'};
  }

=pod
=section schedule
=H Schedule a page to be retrieved

Put the page object on the schedule list and set its state to
T<SCHEDULED>.
Usage:
=verbatim
            $page->schedule
=noverbatim

=cut
sub schedule{
  my $self = shift;
  my $priority = shift;

  $priority = 0 unless defined $priority;
  return if $self->fetch_state ne 'FRESH';

  webpage->log('SCHEDULE ' . $self->url);
  push @schedule_list, { 'OBJECT' => $self , 'PRIORITY' => $priority };
  $self->fetch_state('SCHEDULED');
  }

=pod
=section next_page
=H Get the next page to be retrieved

Get the next page to be retrieved from the schedule list
and return the object.

Usage:
=verbatim
            $page->next_page
=noverbatim

returns the page object

=cut
sub next_page {
  my $sle = pop @schedule_list;

  return $sle->{'OBJECT'};
  }


=pod
=section destroy
=H Destroy a page

This method deletes the content and the split content of a page. This
can save up huge memory after a big GIF, JPG or HTML file was saved. The
page however is still alive, because other pages may reference it.

Usage:
=verbatim
              $page->destroy;
=noverbatim
=cut
sub destroy {
  my $self = shift;
  delete $self->{'CONTENT'}; # this is the most memory hungry part, but the rest is kept for reference
  delete $self->{'SPLIT'};
  }

=pod
=section cwd
=H Set current working directory

Sets the directory where the pages should be saved. This
directory is prepended to the relative directory string
calculated by R<file>.

Usage:
=verbatim
             webpage->cwd($directory)
=noverbatim

=cut
sub cwd {
  shift;
  my $cwd = shift;

  if( defined($cwd) ){
    $save_directory = $cwd;
    }
  return $save_directory;
  }

=pod
=section proxy
=H Set or retrieve the proxy

When a page has a proxy defined the method R<get> tries
to retirve it using the proxy. This method sets or
retrieves the proxy for the page

Usage:
=verbatim
              $page->proxy
or
              $page->proxy($proxy)
=noverbatim
=cut
sub proxy {
  my $self = shift;
  my $proxy = shift;

  $self->{'PROXY'} = $proxy if defined $proxy;
  $self->{'PROXY'};
  }


=pod
=section relate
=H Calculate relative URL

Pages A and B are retrieved, both are to be saved and
A has a reference to B.

The reference in A referencing B can be absolute or relative.
However after saving both of them the saved form of A B<should>
have a relative reference to the saved location of B.

This method calculates and returns the relative url from A to B.

Usage:
=verbatim
             $href = $pageA->relate($pageB)
=noverbatim

where both T<$pageA> and T<$pageB> are objects to pages.
=cut
sub relate {
  my $self = shift;
  my $rpage = shift;
  my $file = $rpage->file;
  my $return;
  my $this_file;

  my @base_path = split( m{/} , $self->file );
  my @rela_path = split( m{/} , $file );

  if( $#base_path > -1 && $base_path[$#base_path] =~ /\./ ){
    $this_file = pop @base_path; # pop off file name
    }else{
    $this_file = '';
    }

  while( 1 ){
    last if $#base_path == -1 || $#rela_path == -1 || $base_path[0] ne $rela_path[0] ;
    shift @base_path;
    shift @rela_path;
    }

  while( $#base_path > -1 ){
    shift @base_path;
    unshift @rela_path , '..';
    }

  $return = join( '/' , @rela_path );
  $return = '' if $this_file && $this_file eq $return;
  return $return;
  }

=pod
=section split_html
=H Split a HTML content into texts and tags array

An HTML content is viewed as an array of texts and tags.
This method takes the content of a page and creates an
array of texts and tags.

Usage:
=verbatim
             $page->split_html
=noverbatim

See also R<rebuild_content>.
=cut
sub split_html {
  my $self = shift;
  my $html = \$self->{'CONTENT'};
  my @tags = ();
  my $index,$cpos = 0;

  while( 1 ){
    my $text;

    $index = index( $$html, '<' , $cpos);
    if( $index > -1 ){
      $text = substr($$html,$cpos,$index-$cpos);
      }else{
      $text = substr($$html,$cpos);
      }
    if( $text ){
      push @tags , { TYPE     => 'TEXT' ,
                     CONTENT  => $text };
      }
    last if $index == -1;

    $index ++;# step over the '<'
    $cpos = index( $$html , '>' , $index );
    if( $cpos > -1 ){
      $text = substr($$html,$index,$cpos-$index);
      }else{
      $text = substr($$html,$index);
      }
    if( $text ){
      if( $text =~ /\!/ ){# this is a comment, and not a real tag
        # treat it as pure text
        push @tags , { TYPE    => 'TEXT' ,
                       CONTENT => "<$text>" };
#
# LATER WE SHOULD DEAL WITH <SCRIPT> </SCRIPT> TAGS AS TEXT AS WELL
# OR IMPLEMET A JAVASCRIPT ENGINE THAT CAN CALCULATE ALL THE POSSIBLE
# HREF VALUES THAT A JAVASCRIPT CAN CALCULATE (HA HA HA :-)
#
        }else{
        my @tagd = &tagdisa($text);
        push @tags , { TYPE    => 'TAG' ,
                       CONTENT =>  \@tagd };
        }
      }
    last if $cpos == -1;
    $cpos++;# step over the '>'
    }
  $self->{'SPLIT'} = \@tags;
  delete $self->{'CONTENT'};
  }

#
# Get a string containing a html tag and
# return a two element list. The first element of the list
# is the type of the tag (like A, B, P, H1). The second element is
# a reference to a hash. Each element contains the tag parameter name
# and value, like (HREF,http://www.mycom.com) for an A tag.
#
# THIS IS A FUNCTION NOT A METHOD USED ONLY INTERNALLY!!
sub tagdisa {
  my $tag = shift;
  my %params = ();

  $tag =~ s/^\s*//;         # delete leading spaces

  $tag =~ s/^(\/?\w+)//;    # get the type of the tag, NAME or /NAME
  my $type = lc $1;
  $tag =~ s/^\s*//;         # delete space after

  while( $tag ){
    $tag =~ s/^([^\s\=]+)//;         # get parameter name
    my $parn = lc $1;

    $tag =~ s/^\s*//;           # delete space after the parameter value

                                # if there is no =
    unless( $tag =~ s/^\=// ){  # then it is a parameter without value
      $params{$parn} = '';
      next;
      }

    $tag =~ s/^\s*//;           # delete space after the =

    my $parv = '';              # no parameter value so far
    if( $tag =~ s/^\"// ){      # starts with " like href="xxx"

      $tag =~ s/^([^"]*)//;     # anything until the trailing "
      $parv = $1;
      $tag =~ s/^\"//;          # delete the trailing "

      }else{                    # does not start with " like SIZE=3

      $tag =~ s/^(\S*)//;       # anything until space
      $parv = $1;

      }

    $tag =~ s/^\s*//;           # delete space after the parameter value

    $params{$parn} = $parv;
    }

  return ($type, \%params);
  }

=pod
=section rebuild_content
=H rebuild the content of a html file

After the html content was split (see R<split_html>) and the appropriate
references were modified the content should be rebuilt. This method
recreates the modified content from the split format.

Usage:
=verbatim
             $page->rebuild_content
=noverbatim
=cut
sub rebuild_content {
  my $self = shift;
  my $tag;

  $self->{'CONTENT'} = ''; # we delete and the rebuild the content
  for $tag ( @{$self->{'SPLIT'}} ){
    if( $tag->{'TYPE'} eq 'TEXT' ){
      $self->{'CONTENT'} .= $tag->{'CONTENT'};
      }else{
      my $tagtype = $tag->{'CONTENT'}->[0];
      my $tags = $tag->{'CONTENT'}->[1];
      $self->{'CONTENT'} .= "<$tagtype";
      while( ($tagkw,$tagv) = each %{$tags} ){
        $self->{'CONTENT'} .= " $tagkw";
        $self->{'CONTENT'} .= "=\"$tagv\"" if defined $tagv;
        }
      $self->{'CONTENT'} .= '>';
      }
    }
  delete $self->{'SPLIT'};
  }

=pod
=section urel2abs
=H Convert url to absolute

This method convert a relative url to absolute. This is needed for the
retrieval.

The argument of the method can be a relative url from a T<href>
or T<src> tag of the html of the page or can be absolute. The
returned url is an absolute url.

Usage:
=verbatim
            $absolute_url = $page->urel2abs($href_url);
=noverbatim
=cut
sub urel2abs {
  my $self = shift;
  my $relURL  = shift; # relative or absolute URL
  my $baseURL = $self->{'URL'};

  my $i,$return;

  return $relURL if $relURL =~ m{^\w+:};

  $baseURL = 'http://' . $baseURL unless $baseURL =~ m{^http://}i;

  $baseURL =~ m{^http://([\d\w-\.]+):?(\d*)(/.*)?(\#.*)?}i;
  my ($baseHost, $basePort, $basePath, $baseAnchor) = ( $1, $2, $3, $4);

  $basePath =~ s{^/}{};

  if( $relURL =~ m{^/} ){
    $return = 'http://' . $baseHost;
    $return .= ":$basePort" if $basePort ;
    $return .= $relURL;
    return $return;
    }

  if( $relURL =~ m{^\#} ){
    $return = 'http://' . $baseHost;
    $return .= ":$basePort" if $basePort ;
    $return .= "/$basePath";
    $return .= $relURL;
    return $return;
    }

  my @relList = split( m{/} , $relURL );
  my @absList = split( m{/} , $basePath );

  # trailing file name should be thrown away
  if( $#absList > -1 ){
    my $fileName = pop @absList;
    push @absList , $fileName unless $fileName =~ m{\.};
    }

  push @absList , @relList;

  @relList = ();

  while( $i = shift @absList ){
    if( $i eq '..' ){
      pop @relList;
      }else{
      push @relList , $i;
      }
    }
  $relURL = '/' . join( '/' , @relList );
  $return = 'http://' . $baseHost;
  $return .= ":$basePort" if $basePort ;
  $return .= $relURL;
  return $return;
  }

=pod
=section state_message
=H return the page state message

Returns the English language message of the HTTP state of the page.
These are like "OK 200" or "404 Not found" to mention the most well
known examples.

There are also states that may come from the socket handling process,
when retrieval can not get the page.

Usage:
=verbatim
           print $page->state_message
=noverbatim
=cut
sub state_message {
  my $self = shift;
my %status_messages = (
  -1,  "Could not lookup server",
  -2,  "Could not open socket",
  -3,  "Could not bind socket",
  -4,  "Could not connect",
  -5,  "Retrieved file small",
  -6,  "Save file cannot be opened",
  -7,  "Page is too big.",

  200, "OK 200",
  201, "CREATED 201",
  202, "Accepted 202",
  203, "Partial Information 203",
  204, "No Response 204",
  301, "Found, but moved",
  302, "Found, but data resides under different URL (add a /)",
  303, "Method",
  304, "Not Modified",
  400, "Bad request",
  401, "Unauthorized",
  402, "PaymentRequired",
  403, "Forbidden",
  404, "Not found",
  500, "Internal Error",
  501, "Not implemented",
  502, "Service temporarily overloaded",
  503, "Gateway timeout ",
  600, "Bad request",
  601, "Not implemented",
  602, "Connection failed (host not found?)",
  603, "Timed out",
);
  return undef unless defined $self->{'STATE'};
  $self->{'STATE_MESSAGE'} = $status_messages{$self->{'STATE'}};
  }

=pod
=section get
=H get a page

Get a page using http GET.

Usage:
=verbatim
               $page->get
=noverbatim
=cut
sub get {
  my $self = shift;
  my $interface;
  my $buffer;

  return if $self->fetch_state eq 'RETRIEVED';

  my $URL  = $self->{'URL'};
  webpage->log("GET $URL");

  for( @proxies ){
    my $domain = $_->{'DOMAIN'};
    if( $self->url =~ m{$domain} ){
      $self->proxy($_->{'PROXY'});
      last;
      }
    }

  my $PROXY = $self->{'PROXY'};

  # split the URL to (host, port, path)
  if( $PROXY ){#using proxy

    $PROXY =~ m{(?:http://)?([\w-\.]+):?(\d*)};
    $host = $1;   #proxy host
    $port = $2;   #proxy port
    $path = $URL; #Using proxy the path is the full URL
    if ($port eq "") { $port = 8080; }

    }else{#if do not use proxy

    if ($URL =~ m#^http://([\w-\.]+):?(\d*)(/.*)?#) {
      $host = $1; #URL host
      $port = $2; #URL port
      if( $use_full_url ){
        $path = $URL; #send the full path
        }else{
        $path = $3; #URL path
        }
      }
    if ($path eq "") { $path = '/'; }
    if ($port eq "") { $port = 80; }

    }

  $AF_INET = 2;
  $SOCK_STREAM = 1;

  $sockaddr = 'S n a4 x8';

  if( $host_ip{$host} ){
    $thataddr = $host_ip{$host};
    }else{
    if (!(($name,$aliases,$type,$len,$thataddr) = gethostbyname($host))){
      $self->{'STATE'} = -1;
      return;
      }
    $host_ip{$host} = $thataddr;
    $ip_host{
      sprintf("%d.%d.%d.%d",ord(substr($thataddr,0,1)),
                              ord(substr($thataddr,1,1)),
                                ord(substr($thataddr,2,1)),
                                  ord(substr($thataddr,3,1)) ) } = $host
      unless $host =~ m{\d+\.\d+\.\d+\.\d+};
    }

  my $thataddr_ip_string = sprintf("%d.%d.%d.%d",ord(substr($thataddr,0,1)),
                              ord(substr($thataddr,1,1)),
                                ord(substr($thataddr,2,1)),
                                  ord(substr($thataddr,3,1)) );
  $interface = '';
  for( @interfaces ){
     if( $host =~ m{$interfaces{$_}} || $thataddr_ip_string =~ m{$interfaces{$_}} ){
       $interface = $_;
       last;
       }
     }
  if( ! $interface ){
    webpage->log("ERROR no interface defined for $host");
    webpage->log("QUITting");
    exit;
    }

  if( $host_ip{$interface} ){
    $thisaddr = $host_ip{$interface};
    }else{
    if (!(($name,$aliases,$type,$len,$thisaddr) = gethostbyname($interface))){
      webpage->log("ERROR can not bind local interface \"$interface\"");
      webpage->log("QUITting");
      $self->{'STATE'} = -1;
      return;
      }
    $host_ip{$interface} = $thisaddr;
    $ip_host{
      sprintf("%d.%d.%d.%d",ord(substr($thisaddr,0,1)),
                              ord(substr($thisaddr,1,1)),
                                ord(substr($thisaddr,2,1)),
                                  ord(substr($thisaddr,3,1)) ) } = $interface
      unless $interface =~ m{\d+\.\d+\.\d+\.\d+};
    }

  ($name,$aliases,$proto) = getprotobyname('tcp');

  $this = pack($sockaddr, $AF_INET, 0, $thisaddr);
  $that = pack($sockaddr, $AF_INET, $port, $thataddr);

  # Make the socket filehandle.
  if (!(socket(S, $AF_INET, $SOCK_STREAM, $proto))){
    $self->{'STATE'} = -2;
    $self->state_message;
    return;
    }

  # Give the socket an address
  if (!(bind(S, $this))) {
    $self->{'STATE'} = -3;
    $self->state_message;
    return;
    }

  my $time1 = time();
  if (!(connect(S,$that))) { 
    $self->{'STATE'} = -4;
    $self->state_message;
    return;
    }

  select(S); $| = 1; select(STDOUT);
  my $old_separator = $/; $/ = "\n";

  my $time2 = time();
  print S "GET $path HTTP/1.0\n";
  print S "User-Agent: $USERAGENT\n";
  print S "Accept: */*\n";

  if( $do_cookies ){
    my $ck = $self->list_cookies;
    if( $ck ){ #if there are any cookies
      print S "Cookie: ",$ck,"\n";
      webpage->log("COOKIE: $ck");
      }
    }

  my $auth = shift @{$self->{'AUTHLIST'}};
  if( defined $self->realm ){
    while( 1 ){
      last if $self->realm =~ $auth->{'REALM'};
      last unless defined($auth = pop @{$self->{'AUTHLIST'}});
      }
    }

  if( $auth->{'AUTH'} ){
    print S 'Authorization: Basic ' . $auth->{'AUTH'} . "\n";
    }

  print S "\n";
  binmode S;
  $response = <S>;
  chomp $response;
  ( $self->{'PROTOCOL'} , $self->{'STATE'} ) = split(/ /, $response);
  $self->state_message;
  webpage->log('HEADER State: ' . $self->{'STATE'});
  $self->{'CONTENT_LENGTH'} = 0;
  while(<S>){
    chomp;
    if( $_ eq chr(13) || $_ eq "" ){
      last;
      }

    while( s/\n$// || s/\r$// ){}# delete all \n and \r from the end
                                 # hopefully there are no such characters inside

    /([^\s\:]+):\s+(.*)/; # split the header line

    # store the header information in the header hash
    # if a header has multiple values then the second and the rest
    # is stored in the header continuation hash array
    if( defined $self->{'HEADER'}->{lc($1)} ){
      push @{$self->{'HEADERc'}->{lc($1)}} , $2;
      }else{
      $self->{'HEADER'}->{lc($1)} = $2;
      }
    webpage->log("HEADER $1: $2");
    }

  $self->set_all_cookies if $do_cookies;

  $self->{'CONTENT'} = '';
  # some servers report extraordinary large content-length (a few gigs), then it fails.
  if( $cl_believe &&
      $pagesizelimit &&
      $self->{'HEADER'}->{'content-length'} &&
      $pagesizelimit < $self->{'HEADER'}->{'content-length'} ){
    $self->{'STATE'} = -7;
    $self->state_message;
    return;
    }

  my $total = 0;
  for( $i=read(S,$buffer,1024) ; $i > 0 ; $i=read(S,$buffer,1024)){
    $total += $i; # count the total downloaded bytes
    if( $pagesizelimit && $pagesizelimit < $total ){
      $self->{'CONTENT'} = ''; # exit if file is too large
      $self->{'STATE'} = -7;
      $self->state_message;
      return;
      }
    $self->{'CONTENT'} .= $buffer;
    }

  my $time3 = time();
  close(S);
  $self->{'CONTENT_LENGTH'} = length $self->{'CONTENT'};

  my $dnrate;
  if(  $time3 - $time2 != 0 ){
    $dnrate = length($self->{'CONTENT'}) / ($time3 - $time2) / 128;
    if( $dnrate =~ /(\d+).(\d*)/ ){
      $dnrate = $1 . '.' . ($2 ? substr($2,0,2) : '00');
      }else{
      $dnrate .= '.00';
      }
    }else{ $dnrate = 'N/A ' }
  webpage->log('RETRIEVED ' . length($self->{'CONTENT'}) .
                 'bytes; Connect time=' . ($time2-$time1) .
                 'sec; Download time=' . ($time3 - $time2) .
                 'sec; Speed=' . $dnrate . 'Kbps; state=' . $self->{'STATE'});

  $/ = $old_separator;
  $self->fetch_state('RETRIEVED');
  if( $self->content_type ne 'text/html' ){
    $self->create_file;
    }
  return;
  }

=pod
=section realm
=H Return the realm of a page

Returns the realm that the webserver named for authentication when the page was
downloaded (or at least there was a trial to download it).

As a side effect this method also calculates the R<auth_type> value for the page.

Usage:
=verbatim
               $page->realm
=noverbatim

=cut
sub realm {
  my $self = shift;

  my $www_authenticate;

  return $self->{'REALM'} if $self->{'REALM'};

  if( defined($www_authenticate=$self->{'HEADER'}->{'www-authenticate'}) ){
    if( $www_authenticate =~ /\s*(\w+)\s*realm=(.*)$/ ){
      $self->{'AUTH_TYPE'} = lc $1;
      $self->{'REALM'} = $2;
      }else{ return undef }
    }else{ return undef }
  }

=pod
=section auth_type
=H Return the authentication type of a page

Returns the authentication type that the webserver named for authentication when the page was
downloaded (or at least there was a trial to download it).

This type should be T<basic> or we are out of luck.

Usage:
=verbatim
               $page->auth_type
=noverbatim

If this parameter was not extracted yet from the header lines before this method calls
R<realm> to perform the calculation.

=cut
sub auth_type{
  my $self = shift;

  $self->realm;
  return $self->{'AUTH_TYPE'};
  }

=pod
=section file
=H Create a file name for an url

Return the file name of the page where it is going to be saved or where
it was saved.

Usage:
=verbatim
                 $page->file
=noverbatim
=cut
sub file {
  my $self = shift;
  return $self->{'FILE'} if defined $self->{'FILE'};

  $self->get; # get it to have the content type

  # method get calls create file for all non text/html mime types
  # which means calling this method recursively. So when it
  # calculates and returns to create_file and then returns to method get
  # and returns to here this parameter is already calculated...
  return $self->{'FILE'} if defined $self->{'FILE'};

  my $URL = webpage->normalize_url($self->url);
  my $path;
  my $port;

  if( $URL =~ m{^http://([\w-\.]+):?(\d*)(/.*)?} ){
    $host = $1; #URL host
    $port = $2; #URL port
    $path = $3; #URL path
    }
  if ($port eq '') { $port = 80; }
  if( $path =~ m{^/} ){ $path = substr($path,1) }

  # convert all characters that are active like ?g=1&h%20=14
  $path =~ tr/\?/\//;
  $path =~ tr/\=/\//;
  $path =~ tr/\&/\//; # to avoid too long file names
  $path =~ s/\$/\$24/g; # convert $ to $24
  # convert all other character to $xx where xx is hexa ASCII
  while( $path =~ m{([^\d\w./\$\-\_])} ){
    my $orc = $1;
    my $rep = sprintf("%02X",ord($orc));
    $orc = quotemeta $orc;
    $path =~ s/$orc/\$$rep/g;
    }
  $path =~ s{//}{/a/}g; # sometimes it happens that there are two neighbouring characters converted to /
  my @dlist = split(/\./ , $host);
  @dlist = reverse( @dlist );

  push @dlist , $port;

  if( $map_method eq 'simple' ){
    @dlist = ();
    }

  push @dlist , split(/\// ,$path);

  my $file_name = pop @dlist;

  push @dlist , $file_name;

  if( $file_name =~ /\.([^.]*)$/ ){
    my $extension = $1;
    my $pextarr = $mime_extension{$self->content_type};
    my $append = '.' . $pextarr->[0];
    for( @{$pextarr} ){
      my $pat = $_;
      if( substr($pat,0,1) eq '~' ){
        $pat = substr($pat,1);
        if( lc($extension) =~ /$pat/ ){
          $append = '';
          last;
          }
        }else{
        if( lc($extension) eq $_ ){
          $append = '';
          last;
          }
        }
      }

    $file_name .= $append;
    pop @dlist;
    push @dlist , $file_name;
    }else{
    if( defined($file_name=$self->{'HEADER'}->{'content-location'})){
      my @loca = split(/\//,$file_name);
      $file_name = pop @loca;
      }else{
      if( defined($mime_extension{$self->content_type}) ){
        $file_name = 'index.' . $mime_extension{$self->content_type}->[0];
        }else{ $file_name = 'index.html'; } # we are practically lost at this point
      }
    push @dlist , $file_name;
    }

  if( $map_method eq 'flat' ){
    $file_name =~ /\.([^.]*)$/;
    $self->{'FILE'} = $map_counter++ . ".$1";
    }else{
    $self->{'FILE'} = join('/',@dlist);
    }
  }

=pod
=section file_exists
=H Checks file existence

Returns true if the file of the page already exists.

Usage:
=verbatim
                $page->file_exists
=noverbatim

This method is not used in current version, but somehow
it survived during development. Zap it!
=cut
sub file_exists {
  my $self = shift;

  my $file = $self->cwd . '/' . $self->file;
  -e $file;
  }

=pod
=section create_file
=H Create the file for a page

Create a new file and save the content of the page into it.

Usage:
=verbatim
                     $page->create_file
=noverbatim
=cut
sub create_file {
  my $self = shift;

  return if   $self->fetch_state eq 'SAVED';

  my $file = $self->cwd . '/' . $self->file;

  $file =~ s{\\}{/}g; # convert all \ to /
  $file =~ s{//}{/}g; #delete double slashes if any remained

  my @dlist = split( m{/} , $file);

  pop @dlist; #remove the trailing file name

  if( $#dlist == -1 ){ return; }#this is a simple file name in the current directory

  my $cwd = shift @dlist;#take the first subdirectory
  if( ! -d $cwd ){#if does not exist create it
    mkdir $cwd,$umask;
    }

  for( @dlist ){
    $cwd .= "/$_";#take the next subdirectory
    if( ! -d $cwd ){
      mkdir $cwd,$umask;  #if does not exist create it
      }
    }
  webpage->log("SAVED $file of type " . $self->content_type);
  my $FH = $self->{'URL'};
  open($FH,">$file") or die "$file can not be opened.";
  binmode $FH unless $self->content_type =~ /^text/;
  print $FH $self->{'CONTENT'};
  close $FH;
  $self->fetch_state('SAVED');
  $self->destroy;
  }

=pod
=section length
=H length of the content of a page
This method returns the length of the content of a page.
The length is set by the method GET after the page was retrieved.

This method is needed because older version used T<length $page->>T<{'CONTENT'}>
but that does not work after the page was saved and destroyed.
=cut
sub length {
  my $self = shift;
  return $self->{'CONTENT_LENGTH'};
  }

=pod
=section text_leaf
=H Create a leaf text as HTML content

Create a text and put it into the content of the page. The content
says that the page was not downloaded and can be found here and there.

There is also some meta tag that helps the navigation automatically
bringing the browser to the original URL. (Refresh after 1 sec.)
=cut
sub text_leaf {
  my $self = shift;
  my $url = $self->url;
    $self->{'CONTENT'} = <<END_HTML;
<HTML>
<HEAD>
<TITLE>Page not retrieved</TITLE>
<META HTTP-EQUIV="Refresh" CONTENT="1; URL=$url">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
</HEAD>
<BODY>
<FONT FACE="Verdana" SIZE="2">
This page was not retrieved, and can be found at its original location: <A HREF="$url">$url</A>
</FONT>
</BODY>
</HTML>
END_HTML
  }


=pod
=section base64
=H Encodes argument using base64 encoding

This method codes a string to its base64 format as needed by basic authentication.

Usage:
=verbatim
                webpage->base64
=noverbatim
=cut
sub base64{
    shift;
    my $res = "";

    pos($_[0]) = 0;                          # ensure start at the beginning
    while ($_[0] =~ /(.{1,45})/gs) {
	$res .= substr(pack('u', $1), 1);
	chop($res);
    }
    $res =~ tr|` -_|AA-Za-z0-9+/|;               # `# help emacs
    # fix padding at the end
    my $padding = (3 - length($_[0]) % 3) % 3;
    $res =~ s/.{$padding}$/'=' x $padding/e if $padding;
    $res;
}


=pod
=section text_badstate
=H Create a leaf text as HTML content

Create a text and put it into the content of the page. The content
says that the page was not downloaded because an error
and can be found here and there if luck is better now.

There is also some meta tag that helps the navigation automatically
bringing the browser to the original URL. (Refresh after 1 sec.)
=cut
sub text_badstate {
  my $self = shift;
  my $state_message = $self->state_message;
  my $status = $self->{'STATE'};
  my $proxy = $self->proxy;
  my $url = $self->url;
  $self->{'HEADER'}->{'content-type'} = 'text/html';
  $self->{'CONTENT'} = <<END_HTML;
<HTML>
<HEAD>
<TITLE>Page not retrieved</TITLE>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
</HEAD>
<BODY>
<FONT FACE="Verdana" SIZE="2">
This page was not retrieved, because the webserver
(or the proxy $proxy) returned an error code
instead of the resource.<p>
The error code was $status.<p>
The official explanation of this code is:<BR>
<center>
$state_message
</center>
<p>
You can try to retrieve the resource right now from <a href="$url">$url</a>
<p>
Good luck!
</FONT>
</BODY>
</HTML>
END_HTML
  }

=pod
=section text_redirect
=H Create a leaf text as HTML content

Create a text and put it into the content of the page. The content
says that the page was redirected and points to the file, where the
redirected page was saved.

There is also some meta tag that helps the navigation automatically
bringing the browser to the new page. (Refresh after 1 sec.)
=cut
sub text_redirect{
  my $self = shift;
  my $file = shift;

  $self->{'HEADER'}->{'content-type'} = 'text/html'; # it is probably already, but who knows
  $self->{'CONTENT'} = <<END_HTML;
<HTML>
<HEAD>
<TITLE>Page redirected</TITLE>
<META HTTP-EQUIV="Refresh" CONTENT="1; URL=$file">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
</HEAD>
<BODY>
<FONT FACE="Verdana" SIZE="2">
This page has been moved to <A HREF="$file">$file</A>
</FONT>
</BODY>
</HTML>
END_HTML
  }
=pod
=section history
=H Version History of webmirror pro

Webmirror history

=itemize
=item T<2.05> Directive T<level inf> now works and is the same as T<level 0>
=item T<    > Anchors now seem to work.
=item T<2.04> Directive T<directget> is introduced and sending full URL for non proxy get also.
=item T<2.03> new T<sub length> and bug fix in T<sub level>. Previous versions in this sub used the variable T<$page>
instead of T<$self> which was a typo. The result was that some pages were not retrieved because the program though they
were too deep, altough they were not.
=item T<2.02> bug fix : some files were saved twice. The problem was that the second save was done with
the destroyed content and it created zero length files.
=item T<2.01> cookie handling
=item T<2.0> new version with total redesign
=item T<1.0> old version
=noitemize

=cut