#
# webmirror
#
# Webmirror is a Perl script that mirrors a whole web site.
# webmirror starts reading web pages from the homepage and recursively
# follows all links specified in the html files that point to a page
# on the same machine and retrieves the html, gif and all other files
# in a directory.
#
# Webmirror automatically generates subdirectories to store the files.
#
# USAGE:
#
# start webmirror from the command line:
#   webmirror webroot startpage rootdirectory depth size Pictures
#
# where
#   webroot        is the URL part under which pages are collected
#   startpage      is the web page where retrieving starts
#                  relative to webroot, in other words
#                  webroot/startpage is the URL of the starting web page
#   rootdirectory  is the directory where the retrieved pages are stored
#   depth          is the maximal number of followed links 'down' in the web
#                  for HTML files, gifs and other picture files are
#                  followed one level deeper
#                  depth = 0 means no limit
#   size           is the maximal size of bytes to be retrieved, can be
#                  postfixed with letter K or M for kilo or mega bytes
#                  size = 0 means no limit
#   Pictures       should be the capital letter 'P' to retrieve the
#                  pictures.
#
# AUTHOR: Peter Vehas
#
# DATE: January 1, 1997.
#
# VERSION: 0.98 beta
#
# BUGS:
#
# This version does not follow the robot exclusion standard.
# THIS IS A MAJOR BUG, THEREFORE YOU SHOULD NOT USE THIS PROGRAM
# IN A PRODUCTION ENVIRONMENT! USE IT ONLY FOR EXPERIMENTAL PURPOSES!
#
# This version does not handle well those absolute links that point to the
# same web. This can happen with some CGI generated pages that generate
# absolute URLs to reach other local pages.
#
# URLs to a directory with default html page will be stored locally under
# the name 'default.htm'. If the server has different setting and there are
# back links to the eg. index.html page, then the page will be retrieved twice.
# However you can treat this as a feature. Change default.htm to some
# other file name if the local server has other setting. We have tested the
# script using WNT4.0 IIS3.0 with default settings.
#
#
#
use strict;
use warnings;
use Socket;

# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------
my $debug = 1;

my @Proxy = (
    #'www-proxy1.myproxy.com','www-proxy2.myproxy.com'
);
my $NoProxy = '.*\.mydomain\.com';   # URLs matching this pattern are fetched directly
my $Default = "default.htm";         # local file name used for directory URLs

# Mode handed to mkdir (despite the name this is a mode, not a mask).
# It was 0666, which creates directories without the search (x) bit, so on
# Unix nothing could be stored inside them.
my $umask = 0777;

# A URL that keeps failing with a retryable error is dropped after this many
# retries; previously such a URL was requeued forever.
my $MaxRetries = 10;

# NOTE(review): the statement that defined $LogFileName is missing from the
# source as received (the text between the usage message and the open() was
# lost). The name below is a placeholder - confirm the intended location.
my $LogFileName = "webmirror.log";
my $LogFile;

my %OkStatusMsgs = (
    200, "OK 200",
    201, "CREATED 201",
    202, "Accepted 202",
    203, "Partial Information 203",
    204, "No Response 204",
);

my %FailStatusMsgs = (
    -1, "Could not lookup server",
    -2, "Could not open socket",
    -3, "Could not bind socket",
    -4, "Could not connect",
    -5, "Retrieved file small",
    -6, "Save file cannot be opened",
    301, "Found, but moved",
    302, "Found, but data resides under different URL (add a /)",
    303, "Method",
    304, "Not Modified",
    400, "Bad request",
    401, "Unauthorized",
    402, "PaymentRequired",
    403, "Forbidden",
    404, "Not found",
    500, "Internal Error",
    501, "Not implemented",
    502, "Service temporarily overloaded",
    503, "Gateway timeout ",
    600, "Bad request",
    601, "Not implemented",
    602, "Connection failed (host not found?)",
    603, "Timed out",
);

# Each proxy has a fail counter.
# We always use the proxy having smallest number
my @ProxyRate = (0) x @Proxy;

#
# Get the parameters
#
# webmirror http://www.digital.com index.html c:\digitalweb 20 1M P [UE]
#
my $RootURL  = shift;   # webroot
my $StartURL = shift;   # startpage
my $RootDir  = shift;   # rootdirectory
my $Depth    = shift;   # depth
my $MaxSize  = shift;   # size
my $Pictures = shift;   # Pictures
my $Update   = shift;   # optional [UE]

# NOTE(review): everything from this usage message down to the opening of the
# log file was lost from the source as received and has been reconstructed
# from the header comment. Confirm the wording and the details marked below.
if( !$RootURL ){
    print <<'END_USAGE';
Usage: webmirror webroot startpage rootdirectory depth size Pictures

  webroot        the URL part under which pages are collected
  startpage      the page where retrieving starts, relative to webroot
  rootdirectory  the directory where the retrieved pages are stored
  depth          maximal number of followed links, 0 means no limit
  size           maximal number of bytes to retrieve, may be postfixed
                 with K or M, 0 means no limit
  Pictures       the capital letter P to retrieve the pictures
END_USAGE
    exit;
}

$StartURL = '' unless defined $StartURL;
$RootDir  = '' unless defined $RootDir;
$Pictures = '' unless defined $Pictures;
$Update   = '' unless defined $Update;
$Depth    = ( defined $Depth && $Depth =~ /^(\d+)/ ) ? $1 : 0;

# The header documents a K/M postfix for the size, but the visible code only
# ever compared $MaxSize numerically.
# NOTE(review): 1K is taken as 1024 bytes - confirm.
if( defined $MaxSize && $MaxSize =~ /^(\d+)\s*([KM]?)/i ){
    my %Unit = ( '' => 1, 'K' => 1024, 'M' => 1024 * 1024 );
    $MaxSize = $1 * $Unit{ uc $2 };
}
else {
    $MaxSize = 0;
}

# getURL tests $Upload == 2 ("extend the web"), but nothing visible assigns it.
# NOTE(review): the mapping E => 2, U => 1 is inferred from the "[UE]" in the
# sample command line and the comment in getURL - confirm.
my $Upload = ( $Update =~ /^E/i ) ? 2 : ( $Update =~ /^U/i ) ? 1 : 0;

open( $LogFile, '>', $LogFileName )
    or die "Cannot open log file $LogFileName: $!\n";
{
    my $Previous = select($LogFile);
    $| = 1;                          # keep the log current if we are killed
    select($Previous);
}

#append / after rootdir if needed
if( $RootDir !~ m#/$# ){
    $RootDir .= "/";
}

# insert http:// if needed, and make the scheme lower case because the
# later patterns are case sensitive
if( $RootURL !~ m#^http://#i ){
    $RootURL = 'http://' . $RootURL;
}
$RootURL =~ s#^http://#http://#i;

my $RootHost = '';
my $RootPath = '';
if( $RootURL =~ m#^http://([\w.-]+):?(\d*)(/.*)?# ){
    $RootHost = $1;                          #URL host
    $RootPath = defined $3 ? $3 : '';        #URL path
}

#No trailing /
$RootPath =~ s#/$##;

#$StartURL should have a leading /
if( $StartURL !~ m#^/# ){
    $StartURL = "/" . $StartURL;
}

#make $RootPath a proper search pattern
$RootPath = quotemeta($RootPath);

my %ToDo = ( $RootURL . $StartURL, 1 );   # URL , deepness of the URL. 0 if retrieved.
my @URLs = keys %ToDo;
my %Retries;                              # URL , number of retries so far
my $TotalDownloadedBytes = 0;
my $ContentType = '';                     # set by getURL for every request
my $NewLocation = '';                     # set by getURL for every request

# retrieve all files that ARE on the ToDo list
while( @URLs ){
    last if $MaxSize > 0 && $TotalDownloadedBytes > $MaxSize;

    # $Key is the URL exactly as queued and is what %ToDo is indexed by;
    # $URL is the normalised form used for fetching. The two can differ
    # (double slashes), so all bookkeeping uses $Key.
    my $Key = $URLs[0];
    my $URL = $Key;
    $URL =~ s#http://##;
    $URL =~ s#//#/#g;
    $URL = 'http://' . $URL;

    # Only claim to use a proxy when one is actually configured.
    my $UseProxy = ( @Proxy && $URL !~ /$NoProxy/ ) ? 1 : 0;

    print "retrieving $URL\n" if $debug;
    print {$LogFile} "Retrieving $URL ";
    print {$LogFile} ( $UseProxy ? "using proxy.\n" : "not using proxy.\n" );

    #find the proxy with the smallest fail rate
    my $i = 0;
    if( $UseProxy ){
        for my $n ( 1 .. $#Proxy ){
            $i = $n if $ProxyRate[$n] < $ProxyRate[$i];
        }
    }

    my $SaveFile = MakeFileName( $RootDir, $RootPath, $URL );
    if( $SaveFile eq "" ){   #this file is NOT under the root directory
        $ToDo{$Key} = 0;
        shift @URLs;
        next;
    }

    my $result;
    if( $UseProxy ){
        print {$LogFile} "Getting $URL through ", $Proxy[$i], "\n";
        $result = getURL( $URL, $SaveFile, $Proxy[$i] );
    }
    else {
        print {$LogFile} "Getting $URL\n";
        $result = getURL( $URL, $SaveFile );
    }

    if( $result >= 200 && $result < 300 ){
        print {$LogFile} "succesful retrieval.\n Result code is $result meaning: ",
            ( defined $OkStatusMsgs{$result} ? $OkStatusMsgs{$result} : '' ), "\n";
        if( $ContentType =~ m#text/html#i ){
            WupFile( $SaveFile, $ToDo{$Key}, $URL );
        }
        $ToDo{$Key} = 0;
        shift @URLs;
        next;
    }

    print {$LogFile} "Unsuccesful trial ",
        ( defined $FailStatusMsgs{$result} ? $FailStatusMsgs{$result} : '' ), "\n";

    if( $result == -1 || $result == -4 || $result == -5 ){
        # lookup / connect failure or truncated file: blame the proxy, retry
        $ProxyRate[$i]++ if $UseProxy;
        RetryLater($Key);
        next;
    }

    if( $result == -2 || $result == -3 ){
        # local socket trouble: retry without blaming the proxy
        RetryLater($Key);
        next;
    }

    if( $result == 301 || $result == 302 ){
        my $New = $NewLocation;
        # A server may send only a path; complete it with the host we asked.
        if( $New =~ m#^/# && $URL =~ m#^(http://[^/]+)# ){
            $New = $1 . $New;
        }
        shift @URLs;
        # Follow the redirection only to a URL that has never been queued,
        # otherwise two pages redirecting to each other would loop forever.
        if( $New =~ m#^http://# && !exists $ToDo{$New} ){
            $ToDo{$New} = $ToDo{$Key};
            push @URLs, $New;
        }
        else {
            print {$LogFile} "Redirection of $URL to '$New' is not followed.\n";
        }
        $ToDo{$Key} = 0;   #treat this as downloaded
        next;
    }

    if( $result >= 300 ){
        $ProxyRate[$i]++ if $UseProxy;
        if(    $result == 502     # Service temporarily overloaded
            || $result == 503     # Gateway timeout
            || $result == 603     # Timed out
        ){
            RetryLater($Key);
        }
        else {
            shift @URLs;
        }
        next;
    }

    # Anything else (e.g. -6, save file cannot be opened) will not get better
    # by retrying. Without this the same URL stayed at the head of the queue
    # and the loop never terminated.
    $ToDo{$Key} = 0;
    shift @URLs;
}

close $LogFile;
exit;

#
# &RetryLater url
#
# Takes the URL off the head of the queue and puts it on the end, unless it
# has already been retried $MaxRetries times, in which case it is dropped.
# Returns 1 if the URL was requeued and 0 if it was given up.
#
sub RetryLater {
    my ($Key) = @_;

    shift @URLs;                       #take it from the start and
    if( ++$Retries{$Key} <= $MaxRetries ){
        push @URLs, $Key;              # put it on the end
        return 1;
    }
    print {$LogFile} "Giving up on $Key after $MaxRetries retries.\n";
    $ToDo{$Key} = 0;
    return 0;
}

#
# &getURL url savefile [proxy]
#
# Retrieves url with a HTTP/1.0 GET, directly or through proxy
# (host or host:port, default port 8080), and stores the response body
# in savefile.
#
# Sets the file level variables $ContentType and $NewLocation from the
# response header and adds the number of stored bytes to
# $TotalDownloadedBytes.
#
# Returns the HTTP status code of the response, or
#   -1  server name could not be looked up
#   -2  socket could not be created
#   -3  socket could not be bound
#   -4  connection failed, or the server sent nothing at all
#   -5  fewer bytes arrived than Content-Length announced
#   -6  savefile could not be opened
#   600 the status line could not be understood
#
sub getURL {
    my ( $URL, $SaveFile, $PROXY ) = @_;

    if( $Upload == 2 && -e $SaveFile ){   #extend the web, do not check anything
        # The caller decides from $ContentType whether to look for links in
        # the file. It used to see whatever the previous request had left
        # there.
        # NOTE(review): judging by the file name is a guess at the intent -
        # confirm.
        $ContentType = ( $SaveFile =~ /\.s?html?$/i ) ? 'text/html' : '';
        return 200;
    }

    $ContentType = '';
    $NewLocation = '';

    my ( $urlhost, $urlport, $urlpath ) = ( '', '', '' );
    if( $URL =~ m#^http://([\w.-]+):?(\d*)(/.*)?# ){
        $urlhost = $1;                         #URL host
        $urlport = $2;                         #URL port
        $urlpath = defined $3 ? $3 : '';       #URL path
    }

    my ( $host, $port, $path );
    if( $PROXY ){   #using proxy
        if( $PROXY =~ m#([\w.-]+):?(\d*)# ){
            $host = $1;   #proxy host
            $port = $2;   #proxy port
        }
        $path = $URL;     #Using proxy the path is the full URL
        $port = 8080 if !defined $port || $port eq "";
    }
    else {          #if do not use proxy
        $host = $urlhost;
        $port = $urlport;
        $path = $urlpath;
        $path = '/' if $path eq "";
        $port = 80  if $port eq "";
    }
    return -1 if !defined $host || $host eq '';

    my $HostHeader = $urlhost;
    $HostHeader .= ":$urlport" if $urlport ne '' && $urlport != 80;

    # The Socket module supplies the address family / socket type constants
    # and packs the addresses; the numbers and the pack template that used to
    # be hard coded here are not the same on every platform.
    my $thataddr = inet_aton($host);
    return -1 unless defined $thataddr;

    my $proto = getprotobyname('tcp');
    my $S;

    # Make the socket filehandle.
    socket( $S, PF_INET, SOCK_STREAM, $proto ) or return -2;

    # Give the socket an address (any local address, any port)
    if( !bind( $S, sockaddr_in( 0, INADDR_ANY ) ) ){
        close $S;
        return -3;
    }
    if( !connect( $S, sockaddr_in( $port, $thataddr ) ) ){
        close $S;
        return -4;
    }

    # binmode before the first print so that the line ends written below
    # reach the server unchanged on every platform.
    binmode $S;
    {
        my $Previous = select($S);
        $| = 1;
        select($Previous);
    }

    my $EOL = "\015\012";   # HTTP header lines end with CR LF
    print {$S} "GET $path HTTP/1.0$EOL";
    print {$S} "Host: $HostHeader$EOL" if $HostHeader ne '';
    print {$S} "User-Agent: pWErliB Mirror Agent/1.1$EOL";
    print {$S} "Accept: */*$EOL";
    print {$S} $EOL;

    print {$LogFile} "Response header:\n";
    my $response = <$S>;
    if( !defined $response ){
        # the connection was closed before anything arrived; report it like
        # a failed connection so that the caller retries
        close $S;
        return -4;
    }
    $response =~ s/\r?\n\z//;
    print {$LogFile} " $response\n";

    my ( undef, $status ) = split( / /, $response );
    if( !defined $status || $status !~ /^\d+$/ ){
        close $S;
        return 600;
    }

    my $ContentLength = 0;
    while( defined( my $Line = <$S> ) ){
        $Line =~ s/\r?\n\z//;
        last if $Line eq "";                  # empty line ends the header
        print {$LogFile} " $Line\n";          #print the header lines
        if( $Line =~ /^Location:\s*(.*)/i ){
            $NewLocation = $1;
        }
        if( $Line =~ /^Content-Type:\s*(.*)/i ){
            $ContentType = $1;
        }
        #take the length to recongize that all the file arrived
        if( $Line =~ /^Content-Length:\s*(\d+)\s*/i ){
            $ContentLength = $1;
        }
    }

    if( -d $SaveFile ){
        $SaveFile .= "/$Default";
    }

    my $F;
    if( !open( $F, '>', $SaveFile ) ){
        print {$LogFile} "Output file $SaveFile can not be opened.\n";
        print {$LogFile} "This file is NOT retrieved.\n";
        close $S;
        return -6;
    }
    binmode $F;

    # Copy the body in small pieces. When Content-Length is known stop after
    # that many bytes, otherwise read until the server closes the connection.
    # (With a Content-Length the body used to be read but never written.)
    my $Written   = 0;
    my $Remaining = $ContentLength;
    my $Buffer;
    while( 1 ){
        my $Want = 1024;
        if( $ContentLength > 0 ){
            last if $Remaining <= 0;
            $Want = $Remaining if $Remaining < $Want;
        }
        my $Got = read( $S, $Buffer, $Want );
        last unless $Got;                     # undef on error, 0 at end of file
        print {$F} $Buffer;
        $Written   += $Got;
        $Remaining -= $Got;
    }
    close($F) or print {$LogFile} "Error closing $SaveFile: $!\n";
    close($S);

    # Every stored byte counts against the size limit, not only HTML files.
    $TotalDownloadedBytes += $Written;

    if( $ContentLength > 0 ){
        my $Size = ( -s $SaveFile ) || 0;
        if( $ContentLength == $Size ){
            print {$LogFile} "Size of generated file equals Content-Length.\n";
        }
        elsif( $ContentLength < $Size ){
            print {$LogFile} "Size of generated file ", $Size,
                " is larger than Content-Lentgth: $ContentLength indicated.\n";
        }
        else {
            print {$LogFile} "Size ", $Size, " of generated file $SaveFile",
                " is smaller than Content-Lentgth: $ContentLength indicated.\n";
            print {$LogFile} " Trying to retrieve again.\n";
            return -5;
        }
    }
    return $status;
}

#
# &MakeDirectory localpages/html/web/www/index.html
#
# creates the directory localpages/html/web/www
#
#
sub MakeDirectory {
    my ($Dir) = @_;

    $Dir =~ s#//+#/#g;                  #delete double slashes if any remained
    my @dlist = split( /\//, $Dir );
    pop @dlist;                         #remove the trailing file name
    return if !@dlist;                  #this is a simple file name in the current directory

    my $cwd;
    for my $Part (@dlist){
        $cwd = defined $cwd ? "$cwd/$Part" : $Part;   #take the next subdirectory
        next if $cwd eq '';             # leading / of an absolute path
        next if -d $cwd;
        print "making $cwd\n" if $debug;
        mkdir( $cwd, $umask )           #if does not exist create it
            or print {$LogFile} "Cannot create directory $cwd: $!\n";
    }
    return;
}

#
# &LocalName toty/tit/brat.cgi?a=1
#
# converts a URL path given relative to the mirrored root into the relative
# name of the file it is stored in: active URLs (containing ?, = or &) are
# converted with ConvertActiveURL, and names that look like a directory
# (empty, ending in / or having no dot) get the default file name appended.
#
# Used both for naming the saved file and for rewriting the links that point
# to it, so that the two always agree.
#
sub LocalName {
    my ($Rel) = @_;

    if( $Rel =~ /[?=&]/ ){
        $Rel = ConvertActiveURL($Rel);
        $Rel =~ s#//+#/#g;
        $Rel =~ s#^/##;
    }
    if( $Rel eq '' || $Rel =~ m#/$# || $Rel !~ /\./ ){
        #if there is no extension, then this is probably a directory, but if not, who cares?
        $Rel =~ s#/+$##;
        $Rel = ( $Rel eq '' ) ? $Default : "$Rel/$Default";
    }
    return $Rel;
}

#
# &MakeFileName dire/dur/dan ses/sas/sus http://www.kak.com/ses/sas/sus/toty/tit/brat.html
#
# creates dire/dur/dan/toty/tit
#
# and returns dire/dur/dan/toty/tit/brat.html
#
# returns "" if the URL is not under the root path. The root path is expected
# to be already escaped for use in a pattern (the caller passes it through
# quotemeta) and the root directory to end with a /.
#
sub MakeFileName {
    my ( $RootDir, $RootPath, $URL ) = @_;

    my $path;
    if( $URL =~ m#^http://[\w.-]+:?\d*(/.*)?# ){
        $path = defined $1 ? $1 : '';   #URL path
    }
    else {
        $path = '/';
    }

    # Anchored: the root path has to be the start of the URL path, not just
    # occur somewhere inside it.
    if( $path !~ m#^$RootPath/(.*)# ){
        return "";                      #this is not under the RootPath
    }
    my $Rel = $1;

    # Only the part below the root directory is converted and examined;
    # a root directory such as ./mirror/ or c:\web/ must stay as it is.
    my $SaveFile = $RootDir . LocalName($Rel);

    MakeDirectory($SaveFile);
    return $SaveFile;
}

#
# &ConvertActiveURL cgi-bin/find?name=x
#
# returns the name with ? replaced by / and every character other than
# letters, digits, dot and / replaced by _ and logs the conversion.
#
sub ConvertActiveURL {
    my ($Name) = @_;   # a lexical copy, so the caller's $_ is left alone

    print {$LogFile} "$Name is converted to ";
    $Name =~ tr/?/\//;
    $Name =~ tr/0-9a-zA-Z.\//_/c;
    print {$LogFile} "$Name\n";
    return $Name;
}

#
# Work Up a file regarding URLs
#
# &WupFile filename level url
#
# filename is the locally saved copy of the HTML page retrieved from url and
# level is the deepness at which the page was found.
#
# Every HREF (while the depth limit allows) and every SRC (when pictures are
# requested) that is not an absolute URL is resolved against url and put on
# the todo list with level+1, unless it is already there.
#
# A link that would not work in the local copy - one starting with /, one to
# an active page, or one to a directory - is rewritten in the saved file to
# the name MakeFileName will store the target under, relative to this file.
#
sub WupFile {
    my ( $FileName, $Level, $URL ) = @_;
    my $Http_follow = 1;

    $Level = 0 unless defined $Level;

    if( -d $FileName ){
        $FileName .= "/$Default";
    }

    # How many directories below the root directory is this file? That many
    # ../ lead from the file back to the root of the local copy.
    my $RelFile = $FileName;
    $RelFile =~ s#^\Q$RootDir\E##;
    $RelFile =~ s#//+#/#g;
    $RelFile =~ s#^/##;
    my $FileDepth = ( $RelFile =~ tr{/}{} );
    my $Up        = '../' x $FileDepth;

    print {$LogFile} "Working up $FileName at $Level from $URL\n";

    my $URLbase = '';
    if( $URL =~ s#^(http://[^/]*)## ){   #delete the host part
        $URLbase = $1;
    }
    $URL =~ s/\?.*$//;                   #a query string is not part of the directory
    $URL =~ s#/[^/]*$##;                 #delete the trailing file name if any
    my @Base = grep { $_ ne '' } split( /\//, $URL );

    #
    # if the file is deeper, than the maximal required deepness, then
    # forget it, however the pictures are followed one level deeper
    # otherwise the leaf pages would appear w/o pictures
    #
    if( $Depth ){   # $Depth == 0 means all levels (full web)
        return if $Level > $Depth;
        return if $Level == $Depth && !$Pictures;
        $Http_follow = ( $Level < $Depth );
    }
    $Level++;

    my $F;
    open( $F, '<', $FileName ) or return;
    my @file = <$F>;                     #an HTML file should not be so large
    close $F;

    my $Ofile   = join( '', @file );     #save the original lines
    my $Changed = 0;                     #we haven't changed any URL references yet

    chomp @file;
    my $html_text = join( ' ', @file );
    @file = ();                          #just to save some memory

    REFERENCE:
    while( 1 ){
        my $path;

        # take the next HREF="$path" or SRC="$path"; the substitution removes
        # the reference so that it is not found again
        if( $Http_follow && $html_text =~ s/href\s*=\s*\"([^\"]*)\"//i ){
            $path = $1;
            $path =~ s/\#.*$//;          #remove the anchor if any
        }
        elsif( $Pictures && $html_text =~ s/src\s*=\s*\"([^\"]*)\"//i ){
            $path = $1;
        }
        # take the next HREF=path or SRC=$path lasy html not using " characters ...
        elsif( $Http_follow && $html_text =~ s/href\s*=([^\s\>]*)//i ){
            $path = $1;
            $path =~ s/\#.*$//;          #remove the anchor if any
            $path =~ s/^'(.*)'$/$1/;     #value was quoted with ' characters
        }
        elsif( $Pictures && $html_text =~ s/src\s*=([^\s\>]*)//i ){
            $path = $1;
            $path =~ s/^'(.*)'$/$1/;
        }
        else {
            last;
        }

        next if $path eq "";             #this was probably an anchor within the doc

        #in this version we do not work up absolute URLs
        # even if they point to the same web
        next if $path =~ m#http://#;

        my $Raw = $path;                 # the link as it is written in the file

        $path =~ s#^http:##;             #remove leading 'http:' if any
        # we follow only http links. This has to be decided before the file
        # is touched, otherwise mailto: and similar links get rewritten.
        next if $path =~ m#^\w*:#;
        next if $path eq "";

        # Resolve the link: one starting with / counts from the root of the
        # web server, anything else from the directory of this page.
        my @Parts = ( $path =~ m#^/# ) ? () : @Base;
        for my $Seg ( split( /\//, $path ) ){
            next if $Seg eq '' || $Seg eq '.';
            if( $Seg eq '..' ){
                #nothing to pop from, this points higher than base URL
                next REFERENCE if !@Parts;   #forget it
                pop @Parts;                  #go one directory higher
            }
            else {
                push @Parts, $Seg;           #go one directory deeper
            }
        }
        my $TargetPath = '/' . join( '/', @Parts );   #put the URL together
        $TargetPath .= '/' if @Parts && $path =~ m#/$#;
        my $Target = $URLbase . $TargetPath;

        # Rewrite the link in the saved page when it would not work locally.
        # A target outside the mirrored part is never stored, so a link to
        # it is left alone.
        if( $TargetPath =~ m#^$RootPath/(.*)# ){
            my $Rel   = $1;
            my $Local = LocalName($Rel);
            if( $Raw =~ m#^/# || $Local ne $Rel ){
                my $Link = $Up . $Local;
                # Replace one occurrence only, and only as the value of a
                # HREF or SRC attribute; the look-ahead keeps a link that
                # merely starts with the same characters from being hit.
                if(    $Link ne $Raw
                    && $Ofile =~ s/((?:href|src)\s*=\s*["']?)\Q$Raw\E(?=["'\#\s>]|\z)/$1$Link/i )
                {
                    $Changed = 1;
                }
            }
        }

        if( !exists $ToDo{$Target} ){
            print {$LogFile} "Inserting $Target with $Level on todo list.\n";
            $ToDo{$Target} = $Level;
            push @URLs, $Target;
        }
    }

    if( $Changed ){
        print {$LogFile} "$FileName is converted for active URLs\n";
        open( $F, '>', $FileName ) or return;
        print "$FileName is converted for active URLs\n";
        print {$F} $Ofile;
        close $F;
    }
    return;
}

__END__