# NOPOD NOTICE: the documentation and whitespace have been stripped # from this file in order to reduce filesize. # #!/usr/bin/perl # LW2 version 2.1 # # LW2 copyright 2000-2004 by rfp.labs # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # package LW2; $LW2::VERSION="2.1"; $PACKAGE='LW2'; BEGIN { package LW2; $PACKAGE='LW2'; %AVAILABLE = (); $LW_SSL_LIB = 0; $LW_NONBLOCK_CONNECT=0; eval "require MIME::Base64"; if(!$@){ $AVAILABLE{'mime::base64'}=$MIME::Base64::VERSION; 1 if($MIME::Base64::VERSION); } eval "require MD5"; if(!$@){ $AVAILABLE{'md5'}=$MD5::VERSION; 1 if($MD5::VERSION); } eval "use Socket"; if(!$@){ $AVAILABLE{'socket'}=$Socket::VERSION; eval "use Net::SSLeay"; # do we have SSL support? if(!$@){ $LW_SSL_LIB=1; $AVAILABLE{'net::ssleay'}=$Net::SSLeay::VERSION; 1 if($Net::SSLeay::VERSION); Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize(); } else { eval "use Net::SSL"; if(!$@){ $LW_SSL_LIB=2; $AVAILABLE{'net::ssl'}=$Net::SSL::VERSION; 1 if($Net::SSL::VERSION); } } if($^O!~/Win32/){ eval "use POSIX qw(:errno_h :fcntl_h)"; if(!$@){ $LW_NONBLOCK_CONNECT=1; } } } } # BEGIN sub auth_brute_force { my ($auth_method, $hrin, $user, $pwordref, $dom, $fail_code)=@_; my ($P,%hout); $fail_code||=401; return undef if(!defined $auth_method || length($auth_method)==0); return undef if(!defined $user || length($user) ==0); return undef if(!(defined $hrin && ref($hrin) )); return undef if(!(defined $pwordref && ref($pwordref))); map { ($P=$_)=~tr/\r\n//d; auth_set_header($auth_method,$hrin,$user,$P,$dom); return undef if(http_do_request($hrin,\%hout)); return $P if($hout{whisker}->{code} != $fail_code); } @$pwordref; return undef;} sub auth_unset { my $href=shift; return if(!defined $href || !ref($href)); delete $$href{Authorization}; delete $$href{'Proxy-Authorization'}; delete $$href{whisker}->{auth_callback}; delete $$href{whisker}->{auth_proxy_callback}; delete $$href{whisker}->{auth_data}; delete $$href{whisker}->{auth_proxy_data}; } sub auth_set { my ($method, $href, $user, $pass, $domain)=(lc(shift),@_); return if(!(defined $href && ref($href))); return if(!defined $user || !defined $pass); if($method eq 'basic'){ $$href{'Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'proxy-basic'){ $$href{'Proxy-Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'ntlm'){ http_close($href); $$href{whisker}->{auth_data}=ntlm_new($user,$pass,$domain); $$href{whisker}->{auth_callback}=\&_ntlm_auth_callback; } if($method eq 'proxy-ntlm'){ die("Libwhisker error: proxy-ntlm auth w/ SSL not currently supported") if($href->{whisker}->{ssl}>0); http_close($href); $$href{whisker}->{auth_proxy_data}=ntlm_new($user,$pass,$domain); $$href{whisker}->{auth_proxy_callback}=\&_ntlm_auth_proxy_callback; } } sub cookie_read { my ($count,$jarref,$href)=(0,@_); return 0 if(!(defined $jarref && ref($jarref))); return 0 if(!(defined $href && ref($href) )); return 0 if(!(defined $$href{whisker}->{cookies} && ref($$href{whisker}->{cookies}))); foreach (@{$href->{whisker}->{cookies}}){ cookie_parse($jarref,$_); $count++; } return $count; } sub cookie_parse { my ($jarref, $header)=@_; my ($del,$part,@parts,@construct,$cookie_name)=(0); return if(!(defined $jarref && ref($jarref))); return if(!(defined $header && length($header)>0)); @parts=split(/;/,$header); foreach $part (@parts){ if($part=~/^[ \t]*(.+?)=(.*)$/){ my ($name,$val)=($1,$2); if($name=~/^domain$/i){ $val=~s#^http://##; $val=~s#/.*$##; $construct[1]=$val; } elsif($name=~/^path$/i){ $val=~s#/$## if($val ne '/'); $construct[2]=$val; } elsif($name=~/^expires$/i){ $construct[3]=$val; } else { $cookie_name=$name; if($val eq ''){ $del=1; } else { $construct[0]=$val;} } } else { if($part=~/secure/){ $construct[4]=1;} } } if($del){ delete $$jarref{$cookie_name} if defined $$jarref{$cookie_name}; } else { $$jarref{$cookie_name}=\@construct; } } sub cookie_write { my ($jarref, $hin, $override)=@_; my ($name,$out)=('',''); return if(!(defined $jarref && ref($jarref))); return if(!(defined $hin && ref($hin) )); $override=$override||0; $$hin{'whisker'}->{'ssl'}=$$hin{'whisker'}->{'ssl'}||0; foreach $name (keys %$jarref){ next if($name eq ''); next if($$hin{'whisker'}->{'ssl'}==0 && $$jarref{$name}->[4]>0); if($override || ($$hin{'whisker'}->{'host'}=~/$$jarref{$name}->[1]$/i && $$hin{'whisker'}->{'uri'}=~/$$jarref{$name}->[2]/i)){ $out.="$name=$$jarref{$name}->[0];"; } } if($out ne ''){ $$hin{'Cookie'}=$out; } } sub cookie_get { my ($jarref,$name)=@_; return undef if(!(defined $jarref && ref($jarref))); if(defined $$jarref{$name}){ return @{$$jarref{$name}};} return undef; } sub cookie_set { my ($jarref,$name,$value,$domain,$path,$expire,$secure)=@_; my @construct; return if(!(defined $jarref && ref($jarref))); return if($name eq ''); if($value eq ''){ delete $$jarref{$name}; return;} $path=$path||'/'; $secure=$secure||0; @construct=($value,$domain,$path,$expire,$secure); $$jarref{$name}=\@construct; } %_crawl_config=('save_cookies' => 0, 'reuse_cookies' => 1, 'save_offsites' => 0, 'save_non_http' => 0, 'follow_moves' => 1, 'url_limit' => 1000, 'use_params' => 0, 'params_double_record' => 0, 'skip_ext' => { gif=>1, jpg=>1, png=>1, gz=>1, swf=>1, pdf=>1, zip=>1, wav=>1, mp3=>1, asf=>1, tgz=>1 }, 'save_skipped' => 0, 'save_referrers'=> 0, 'do_head' => 0, 'callback' => 0, 'netloc_bug' => 1, 'normalize_uri' => 1, 'source_callback' => 0 ); %_crawl_linktags = ( 'a' => 'href', 'applet' => [qw(codebase archive code)], 'area' => 'href', 'base' => 'href', 'bgsound' => 'src', 'blockquote' => 'cite', 'body' => 'background', 'del' => 'cite', 'embed' => [qw(src pluginspage)], 'form' => 'action', 'frame' => [qw(src longdesc)], 'iframe' => [qw(src longdesc)], 'ilayer' => 'background', 'img' => [qw(src lowsrc longdesc usemap)], 'input' => [qw(src usemap)], 'ins' => 'cite', 'isindex' => 'action', 'head' => 'profile', 'layer' => [qw(background src)], 'link' => 'href', 'object' => [qw(codebase data archive usemap)], 'q' => 'cite', 'script' => 'src', 'table' => 'background', 'td' => 'background', 'th' => 'background', 'xmp' => 'href', ); sub crawl_new { my ($start, $depth, $reqref, $trackref)=@_; my %X; return undef if(!defined $start || !defined $depth); return undef if(!defined $reqref || !ref($reqref)); $trackref={} if(!defined $trackref || !ref($trackref)); $X{track} =$trackref; $X{request} =$reqref; $X{depth} =$depth||2; $X{start} =$start; $X{magic} =7340; $X{reset}= sub { $X{errors} =[]; # all errors encountered $X{urls} =[]; # temp; used to hold all URLs on page $X{server_tags} ={}; # all server tags found $X{referrers} ={}; # who refers to what URLs $X{offsites} ={}; # all URLs that point offsite $X{response} ={}; # temp; the response hash $X{non_http} ={}; # all non_http URLs found $X{cookies} ={}; # all cookies found $X{forms} ={}; # all forms found $X{jar} ={}; # temp; cookie jar $X{config} ={}; %{ $X{config} } = %_crawl_config; %{ $X{track} } =(); $X{parsed_page_count} =0; }; $X{crawl}= sub { crawl(\%X, @_) }; $X{reset}->(); return \%X; } { # START OF CRAWL CONTAINER sub crawl { my ($C, $START, $MAX_DEPTH)=@_; return undef if(!defined $C || !ref($C) || $C->{magic}!=7340); my $CONFIG = $C->{config}; my $TRACK = $C->{track}; my $URLS = $C->{urls}; my $RESP = $C->{response}; my $REQ = $C->{request}; $START ||= $C->{start}; $C->{depth} = $MAX_DEPTH || $C->{depth}; my ($COUNT, $T, @ST, @url_queue)=(0, ''); my @v=uri_split($START); my $error = undef; $error = 'Start protocol not http or https' if($v[1] ne 'http' && $v[1] ne 'https'); $error = 'Bad start host' if(!defined $v[2] || $v[2] eq ''); push ( @{ $C->{errors} }, $error ) && return undef if(defined $error); @ST = ( $v[2], $v[3], $v[0], 1, '' ); $REQ->{whisker}->{ssl} = 1 if($v[1] eq 'https'); $REQ->{whisker}->{host}=$ST[0]; $REQ->{whisker}->{port}=$ST[1]; $REQ->{whisker}->{lowercase_incoming_headers}=1; $REQ->{whisker}->{ignore_duplicate_headers} =0; delete $REQ->{whisker}->{parameters}; http_fixup_request($REQ); push @url_queue, \@ST; while(@url_queue){ @ST = @{ shift @url_queue }; next if(defined $TRACK->{$ST[2]} && $TRACK->{$ST[2]} ne '?'); if($ST[3] > $C->{depth}){ $TRACK->{$ST[2]}='?' if($CONFIG->{save_skipped}>0); next; } $ST[4]=uri_get_dir($ST[2]); $REQ->{whisker}->{uri}=$ST[2]; my $result = _crawl_do_request($REQ,$RESP,$C); if($result==1 || $result==2){ push @{ $C->{errors} }, "$ST[2]: $RESP->{whisker}->{error}"; next; } $COUNT++; $TRACK->{$ST[2]}=$RESP->{whisker}->{code} if($result==0 || $result==4); $TRACK->{$ST[2]}='?' if( ($result==3||$result==5) && $CONFIG->{save_skipped}>0); if(defined $RESP->{server} && !ref($RESP->{server})){ $C->{server_tags}->{ $RESP->{server} }++; } if(defined $RESP->{'set-cookie'}){ if($CONFIG->{save_cookies}>0){ if(ref($RESP->{'set-cookie'})){ $C->{cookies}->{$_}++ foreach (@{$RESP->{'set-cookie'}}); } else { $C->{cookies}->{$RESP->{'set-cookie'}}++; } } cookie_read($C->{jar}, $RESP) if($CONFIG->{reuse_cookies}>0); } next if($result==4 || $result==5); next if(scalar @url_queue > $CONFIG->{url_limit}); if($result==0){ # page should be parsed if($CONFIG->{source_callback} != 0 && ref($CONFIG->{source_callback}) eq 'CODE'){ &{$CONFIG->{source_callback}}($C); } html_find_tags(\$RESP->{whisker}->{data}, \&_crawl_extract_links_test,0,$C,\%_crawl_linktags); $C->{parsed_page_count}++; } push @$URLS, $RESP->{location} if($result==3); foreach $T (@$URLS){ $T=~tr/\0\r\n//d; next if (length($T)==0); next if ($T=~/^#/i); # fragment push @{$C->{referrers}->{$T}}, $ST[2] if($CONFIG->{save_referrers}>0); if($T=~/^([a-zA-Z0-9]*):/ && lc($1) ne 'http' && lc($1) ne 'https'){ push @{$C->{non_http}->{$T}}, $ST[2] if($CONFIG->{save_non_http}>0); next; } if( substr($T,0,2) eq '//' && $CONFIG->{netloc_bug}>0 ){ if( $REQ->{whisker}->{ssl}>0 ){ $T='https:'.$T; } else { $T='http:'.$T; } } if($CONFIG->{callback} != 0){ next if &{$CONFIG->{callback}}($T,$C); } $T=uri_absolute( $T, $ST[4], $CONFIG->{normalize_uri} ); @v=uri_split($T); if( (defined $v[2] && $v[2] ne $ST[0]) || ($v[3]>0 && $v[3] != $ST[1])){ $C->{offsites}->{uri_join(@v)}++ if($CONFIG->{save_offsites}>0); next; } next if(defined $TRACK->{$v[0]}); # we've processed this already if($v[0]=~/\.([a-z0-9]+)$/i){ if(defined $CONFIG->{skip_ext}->{lc($1)}){ $TRACK->{$v[0]}='?' if($CONFIG->{save_skipped}>0); next; } } if(defined $v[4] && $CONFIG->{use_params}>0){ $TRACK->{$v[0]}='?' if($CONFIG->{params_double_record}>0 && !defined $TRACK->{$v[0]}); $v[0]=$v[0].'?'.$v[4]; } push @url_queue, [ $ST[0], $ST[1], $v[0], $ST[3]+1, '' ]; } # foreach @$URLS=(); # reset for next round } # while return $COUNT; } # end sub crawl sub _crawl_extract_links_test { my ($TAG, $hr, $dr, $start, $len, $OBJ)=(lc(shift),@_); return undef if(!scalar %$hr); # fastpath quickie my $t=$_crawl_linktags{$TAG}; while( my ($key,$val)= each %$hr){ # normalize element values $$hr{lc($key)} = $val; } if($TAG eq 'meta' && defined $$hr{'http-equiv'} && $$hr{'http-equiv'} eq 'refresh' && defined $$hr{'content'} && $$hr{'content'}=~m/url=(.+)/i ){ push(@{$OBJ->{urls}},$1); } elsif(ref($t)){ foreach (@$t){ push(@{$OBJ->{urls}},$$hr{$_}) if(defined $$hr{$_}); } } else { push(@{$OBJ->{urls}},$$hr{$t}) if(defined $$hr{$t}); } if($TAG eq 'form' && defined $$hr{action}){ my $u=$OBJ->{response}->{whisker}->{uri}; $OBJ->{forms}->{ uri_absolute($$hr{action},$u,1) }++; } return undef; } sub _crawl_do_request_ex { my ($hrin,$hrout,$OBJ)=@_; my $ret; $ret=http_do_request($hrin,$hrout); return (2,$ret) if($ret==2); # if there was connection error, do not continue if($ret==0){ # successful request if($$hrout{whisker}->{code} <308 && $$hrout{whisker}->{code} >300){ if($OBJ->{config}->{follow_moves} >0){ return (3,$ret) if(defined $$hrout{location} && !ref($$hrout{location})); } return (5,$ret); # not avail } if($$hrout{whisker}->{code}==200){ if(defined $$hrout{'content-type'} && $$hrout{'content-type'}!~/^text\/htm/i){ return (4,$ret); } } } return (-1,$ret); # fallthrough } sub _crawl_do_request { my ($hrin,$hrout,$OBJ) = @_; my ($cret,$lwret); if($OBJ->{config}->{do_head} && $$hrin{whisker}->{method} ne 'HEAD'){ my $save=$$hrin{whisker}->{method}; $$hrin{whisker}->{method}='HEAD'; ($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ); $$hrin{whisker}->{method}=$save; return $cret if($cret > 0); if($lwret==0){ # successful request if($$hrout{whisker}->{code}==501){ # HEAD not allowed $OBJ->{config}->{do_head}=0; # no more HEAD requests } } } ($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ); return $lwret if($cret < 0); return $cret; } } # CRAWL_CONTAINER sub dump { my %what=@_; my ($final,$k,$v)=(''); while( ($k,$v)=each %what){ return undef if(ref($k) || !ref($v)); $final.="\$$k = "._dump(1,$v,1); $final=~s#,\n$##; $final.=";\n"; } return $final; } sub dump_writefile { my $file=shift; my $output=&dump(@_); return 1 if(!open(OUT,">$file") || !defined $output); print OUT $output; close(OUT); } sub _dump { # dereference and dump an element my ($t, $ref, $depth)=@_; my ($out,$k,$v)=(''); $depth||=1; return 'undef' if($depth > 128); if(!defined $ref){ return 'undef'; } elsif(ref($ref) eq 'HASH'){ $out.="{\n"; while( ($k,$v)=each %$ref){ next if($k eq ''); $out.= "\t"x$t; $out.=_dumpd($k).' => '; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); $out.="},\n"; } elsif(ref($ref) eq 'ARRAY'){ $out.="["; if(~~@$ref){ $out.="\n"; foreach $v (@$ref) { $out.= "\t"x$t; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); } $out.="],\n"; } elsif(ref($ref) eq 'SCALAR'){ $out.=_dumpd($$ref); } elsif(ref($ref) eq 'REF'){ $out.=_dump($t,$$ref,$depth+1); } elsif(ref($ref)){ # unknown/unsupported ref $out.="undef"; } else { # normal scalar $out.=_dumpd($ref); } return $out; } sub _dumpd { # escape a scalar string my $v=shift; return 'undef' if(!defined $v); return "''" if($v eq ''); return "$v" if($v eq '0' || $v!~tr/0-9//c && $v!~m#^0+# ); if($v!~tr/ !-~//c){ $v=~s/(['\\])/\\$1/g; return "'$v'"; } $v=~s#\\#\\\\#g; $v=~s#"#\\"#g; $v=~s#\r#\\r#g; $v=~s#\n#\\n#g; $v=~s#\0#\\0#g; $v=~s#\t#\\t#g; $v=~s#\$#\\\$#g; $v=~s#([^!-~ ])#sprintf('\\x%02x',ord($1))#eg; return "\"$v\""; } sub encode_base64 { goto &MIME::Base64::encode_base64 if($AVAILABLE{'mime::base64'}); my $res = ""; my $eol = $_[1]; $eol = "\n" unless defined $eol; pos($_[0]) = 0; while ($_[0] =~ /(.{1,45})/gs) { $res .= substr(pack('u', $1), 1); chop($res);} $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = (3 - length($_[0]) % 3) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; if (length $eol) { $res =~ s/(.{1,76})/$1$eol/g; } $res; } sub decode_base64 { # ripped from MIME::Base64 goto &MIME::Base64::decode_base64 if($AVAILABLE{'mime::base64'}); my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; # remove padding $str =~ tr|A-Za-z0-9+/| -_|; # convert to uuencoded format while ($str =~ /(.{1,60})/gs) { my $len = chr(32 + length($1)*3/4); # compute length byte $res .= unpack("u", $len . $1 ); # uudecode }$res;} sub encode_uri_hex { # normal hex encoding my $str=shift; $str=~s/([^\/])/sprintf("%%%02x",ord($1))/ge; return $str;} sub encode_uri_randomhex { # random normal hex encoding my @T=split(//,shift); my $s; foreach (@T) { if(m#[;=:&@\?]#){ $s.=$_; next; } if((rand()*2)%2 == 1){ $s.=sprintf("%%%02x",ord($_)) ; } else{ $s.=$_; } } return $s; } sub encode_uri_randomcase { my ($x,$uri)=('',shift); return $uri if($uri!~tr/a-zA-Z//); # fast-path my @T=split(//,$uri); for($x=0;$x<(scalar @T);$x++){ if((rand()*2)%2 == 1){ $T[$x]=~tr/A-Za-z/a-zA-Z/; }} return join('',@T); } sub encode_unicode { my ($c,$r)=('',''); foreach $c (split(//,shift)){ $r.=pack("v",ord($c)); } return $r; } sub decode_unicode { my $str = $_[0]; return $str if($str!~tr/!-~//c); # fastpath my ($lead,$count,$idx); my $out=''; my $len = length($str); my ($ptr,$no,$nu)=(0,0,0); while($ptr < $len){ my $c=substr($str,$ptr,1); if( ord($c) >= 0xc0 && ord($c) <= 0xfd){ $count=0; $c=ord($c)<<1; while( ($c & 0x80) == 0x80){ $c<<=1; last if($count++ ==4); } $c = ($c & 0xff); for( $idx=1; $idx<$count; $idx++){ my $o=ord(substr($str,$ptr+$idx,1)); $no=1 if($o != 0x80); $nu=1 if($o <0x80 || $o >0xbf); } my $o=ord(substr($str,$ptr+$idx,1)); $nu=1 if( $o < 0x80 || $o > 0xbf); if($nu){ $out.=substr($str,$ptr++,1); } else { if($no){ $out.="\xff"; # generic replacement char } else { my $prior=ord(substr($str,$ptr+$count-1,1))<<6; $out.= pack("C", (ord(substr($str,$ptr+$count,1) )&0x7f)+$prior); } $ptr += $count+1; } $no=$nu=0; } else { $out.=$c; $ptr++; } } return $out; } sub encode_anti_ids { my ($rhin,$modes)=(shift,shift); my (@T,$x,$c,$s,$y); my $ENCODED=0; my $W = $$rhin{'whisker'}; return if(!(defined $rhin && ref($rhin))); $$rhin{'whisker'}->{'uri_orig'}=$$rhin{'whisker'}->{'uri'}; if($modes=~/4/){$s=''; if($$W{'uri'}=~m#^/#){ $y=&utils_randstr; $s.=$y while(length($s)<512); $$W{'uri'}="/$s/..".$$W{'uri'}; } } if($modes=~/7/){ $$W{'uri'}=encode_uri_randomcase($$W{'uri'}); } if($modes=~/2/){ $$W{'uri'}=~s#/#/./#g; } if($modes=~/8/){ $$W{'uri'}=~s#/#\\#g; $$W{'uri'}=~s#^\\#/#; $$W{'uri'}=~s#^([a-zA-Z0-9_]+):\\#$1://#; $$W{'uri'}=~s#\\$#/#; } if($modes=~/1/){ if($ENCODED==0){ $$W{'uri'}=encode_uri_randomhex($$W{'uri'}); $ENCODED=1;} } if($modes=~/5/){ ($s,$y)=(&utils_randstr,&utils_randstr); $$W{'uri'}="/$s.html%3F$y=/../$$W{'uri'}"; } if($modes=~/3/){ $s=&utils_randstr; $$W{'uri'}="/%20HTTP/1.1%0d%0aAccept%3a%20$s/../..$$W{'uri'}"; } if($modes=~/6/){ $$W{'http_space1'}="\t"; } } %_forms_ELEMENTS=( 'form'=>1, 'input'=>1, 'textarea'=>1, 'button'=>1, 'select'=>1, 'option'=>1, '/select'=>1 ); sub forms_read { my $dr=shift; return undef if(!ref($dr) || length($$dr)==0); my $A = [ {}, [] ]; html_find_tags($dr,\&_forms_parse_callback,0, $A ,\%_forms_ELEMENTS); if(scalar %{$A->[0]}){ push(@{$A->[1]},$A->[0]); } return $A->[1]; } sub forms_write { my $hr=shift; return undef if(!ref($hr) || !(scalar %$hr)); return undef if(!defined $$hr{"\0"}); my $t='
\n"; return $t; } { # these are 'private' static variables for &_forms_parse_html my $CURRENT_SELECT=undef; my $UNKNOWNS=0; sub _forms_parse_callback { my ($TAG, $hr, $dr, $start, $len,$ar)=(lc(shift),@_); my ($saveparam, $parr, $key)=(0,undef,''); my $_forms_CURRENT = $ar->[0]; my $_forms_FOUND = $ar->[1]; if(scalar %$hr){ while( my ($key,$val)=each %$hr ){ if($key=~tr/A-Z//){ delete $$hr{$key}; if(defined $val){ $$hr{lc($key)}=$val; } else { $$hr{lc($key)}=undef; } } } } if($TAG eq 'form'){ if(scalar %$_forms_CURRENT){ # save last form push (@$_forms_FOUND, $_forms_CURRENT); $ar->[0] = {}; $_forms_CURRENT= $ar->[0]; } $_forms_CURRENT->{"\0"}=[$$hr{name},$$hr{method}, $$hr{action},[]]; delete $$hr{'name'}; delete $$hr{'method'}; delete $$hr{'action'}; $key="\0"; $UNKNOWNS=0; } elsif($TAG eq 'input'){ $$hr{type}='text' if(!defined $$hr{type}); $$hr{name}='unknown'.$UNKNOWNS++ if(!defined $$hr{name}); $$hr{value}=undef if(!defined $$hr{value}); $key=$$hr{name}; push @{$_forms_CURRENT->{$key}}, ['input-'.$$hr{type},$$hr{value},[]]; delete $$hr{'name'}; delete $$hr{'type'}; delete $$hr{'value'}; } elsif($TAG eq 'select'){ $$hr{name}='unknown'.$UNKNOWNS++ if(!defined $$hr{name}); $key=$$hr{name}; push @{$_forms_CURRENT->{$key}}, ['select',undef,[]]; $CURRENT_SELECT=$key; delete $$hr{name}; } elsif($TAG eq '/select'){ push @{$_forms_CURRENT->{$CURRENT_SELECT}},['/select',undef,[]]; $CURRENT_SELECT=undef; return undef; } elsif($TAG eq 'option'){ return undef if(!defined $CURRENT_SELECT); if(!defined $$hr{value}){ my $stop=index($$dr,'<',$start+$len); return undef if($stop==-1); # MAJOR PUKE $$hr{value}=substr($$dr,$start+$len, ($stop-$start-$len)); $$hr{value}=~tr/\r\n//d; } push @{$_forms_CURRENT->{$CURRENT_SELECT}},['option',$$hr{value},[]]; delete $$hr{value}; } elsif($TAG eq 'textarea'){ my $stop=$start+$len; do { $stop=index($$dr,'',$stop+2); return undef if($stop==-1); # MAJOR PUKE } while( lc(substr($$dr,$stop+2,8)) ne 'textarea'); $$hr{value}=substr($$dr,$start+$len,($stop-$start-$len)); $$hr{name}='unknown'.$UNKNOWNS++ if(!defined $$hr{name}); $key=$$hr{name}; push @{$_forms_CURRENT->{$key}},['textarea',$$hr{value},[]]; delete $$hr{'name'}; delete $$hr{'value'}; } else { # button $$hr{name}='unknown'.$UNKNOWNS++ if(!defined $$hr{name}); $$hr{value}=undef if(!defined $$hr{value}); $key=$$hr{name}; push @{$_forms_CURRENT->{$key}},['button',$$hr{value},[]]; delete $$hr{'name'}; delete $$hr{'value'}; } if(scalar %$hr){ if($TAG eq 'form'){ $parr = $_forms_CURRENT->{$key}->[3]; } else { $parr = $_forms_CURRENT->{$key}->[-1]; $parr = $parr->[2]; } my ($k,$v); while( ($k,$v) = each(%$hr) ){ if(defined $v){ push @$parr, "$k=\"$v\""; } else { push @$parr, $k; } } } return undef; }} { # contained variables $DR = undef; # data reference $c = 0; # parser pointer $LEN = 0; sub html_find_tags { my ($dataref, $callbackfunc, $xml, $fref, $tagmap)=@_; return if(!(defined $dataref && ref($dataref) )); return if(!(defined $callbackfunc && ref($callbackfunc))); $xml||=0; my ($INTAG, $CURTAG, $LCCURTAG, $ELEMENT, $VALUE, $cc)=(0); my (%TAG, $ret, $start, $tagstart, $tempstart, $x, $found); my $usetagmap = ((defined $tagmap && ref($tagmap)) ? 1 : 0 ); $CURTAG=$LCCURTAG=$ELEMENT=$VALUE=$cc=''; $DR = $dataref; $LEN = length($$dataref); for ($c=0; $c<$LEN; $c++){ $cc=substr($$dataref,$c,1); next if(!$INTAG && $cc ne '>' && $cc ne '<'); if($cc eq '<'){ if($INTAG) { $cc='>'; $c--; } elsif($xml&& $LEN>($c+9)&& substr($$dataref,$c+1,8) eq '![CDATA['){ $c+=9; $tempstart=$c; $found = index($$dataref,']]>',$c); $c=$found+2; $c = $LEN if($found<0); # malformed XML next; } elsif($LEN>($c+3) && substr($$dataref,$c+1,3) eq '!--'){ $tempstart=$c; $c+=4; $found = index($$dataref,'-->',$c); if($found<0){ $found = index($$dataref,'>',$c); $found = $LEN if($found<0); $c = $found; } else { $c=$found+2; } if( $usetagmap==0 || defined $tagmap->{'!--'}){ my $dat=substr($$dataref,$tempstart+4, $found-$tempstart-4); &$callbackfunc('!--',{'='=>$dat}, $dataref, $tempstart, $c-$tempstart+1, $fref); } next; } elsif(!$INTAG) { next if(substr($$dataref,$c+1,1)=~tr/ \t\r\n//); $c++; $INTAG=1; $tagstart=$c-1; $CURTAG=''; while($c<$LEN && ($x=substr($$dataref,$c,1))!~tr/ \t\r\n>=//){ $CURTAG.=$x; $c++;} chop $CURTAG if($xml && substr($CURTAG,-1,1) eq '/'); $c++ if(defined $x && $x ne '>'); $LCCURTAG=lc($CURTAG); $INTAG=0 if($LCCURTAG!~tr/a-z0-9//); next if($c>=$LEN); $cc=substr($$dataref,$c,1); } } if($cc eq '>'){ next if(!$INTAG); if($LCCURTAG eq 'script' && !$xml){ $tempstart=$c+1; pos($$dataref)=$c; if($$dataref!~m#()#ig){ } else { $c=pos($$dataref)-1; my $l = length($1); $TAG{'='}=substr($$dataref,$tempstart, $c-$tempstart-$l+1); } } elsif($LCCURTAG eq 'textarea' && !$xml){ $tempstart=$c+1; pos($$dataref)=$c; if($$dataref!~m#()#ig){ } else { $c=pos($$dataref)-1; my $l=length($1); $TAG{'='}=substr($$dataref,$tempstart, $c-$tempstart-$l+1); } } $INTAG=0; $TAG{'/'}++ if($xml&&substr($$dataref,$c-1,1) eq '/'); &$callbackfunc($CURTAG,\%TAG, $dataref, $tagstart, $c-$tagstart+1, $fref) if( $usetagmap==0 || defined $tagmap->{$LCCURTAG}); $CURTAG=$LCCURTAG=''; %TAG=(); next; } if($INTAG){ $ELEMENT=''; $VALUE=''; pos($$dataref)=$c; if($$dataref!~m/[^ \t\r\n]/g){ $c=$LEN; next; # should we really abort? } $start=pos($$dataref)-1; if($$dataref!~m/[ \t\r\n<>=]/g){ $c=$LEN; next; # should we really abort? } $c=pos($$dataref)-1; if($c > $start){ $ELEMENT=substr($$dataref,$start,$c-$start); chop $ELEMENT if($xml&&substr($ELEMENT,-1,1) eq '/'); } $cc=substr($$dataref,$c,1); if($cc ne '>'){ if($cc=~tr/ \t\r\n//){ $c++ while(substr($$dataref,$c,1)=~tr/ \t\r\n//); } if(substr($$dataref,$c,1) eq '='){ $c++; $start=$c; my $p = substr($$dataref,$c,1); if($p eq '"' || $p eq '\''){ $c++; $start++; $c = index($$dataref,$p,$c); if($c<0){ $c=$LEN; next; } # Bad HTML $VALUE=substr($$dataref,$start,$c-$start); $c++; pos($$dataref)=$c; } else { pos($$dataref)=$c; if($$dataref!~/[ \t\r\n>]/g){ $c=$LEN; } else { $c=pos($$dataref)-1; $VALUE=substr($$dataref,$start,$c-$start); chop $VALUE if($xml&&substr($$dataref,$c-1,2) eq '/>'); } } if(substr($$dataref,$c,1)=~tr/ \t\r\n//){ if($$dataref!~/[^ \t\r\n]/g){ $c=$LEN; next; # should we really abort? } $c=pos($$dataref)-1; } } } # if $c ne '>' $c--; $TAG{$ELEMENT}=$VALUE if($ELEMENT ne '' || ($xml && $ELEMENT ne '/')); } } if($INTAG){ &$callbackfunc($CURTAG,\%TAG, $dataref, $tagstart, $c-$tagstart+1, $fref) if( $usetagmap==0 || defined $tagmap->{$LCCURTAG}); } $DR=undef; # void dataref pointer } sub html_find_tags_rewrite { return if(!defined $DR); my ($pos, $len, $replace_str)=@_; substr($$DR,$pos,$len)=$replace_str; my $l = (length($replace_str)-$len); $c +=$l; $LEN+=$l; } sub _html_find_tags_adjust { my ($p,$l)=@_; $c +=$p; $LEN+=$l; } } # end container sub html_link_extractor { my $data = shift; my $ptr; if(ref($data)){ $ptr = $data; } else { $ptr = \$data; } my %OBJ = ( urls => [], forms => {} ); $OBJ{response}={}; $OBJ{response}->{whisker}={}; $OBJ{response}->{whisker}->{uri}=''; html_find_tags( $ptr, # data \&_crawl_extract_links_test, # callback function 0, # xml mode \%OBJ, # data object \%_crawl_linktags); # tagmap return @{ $OBJ{urls} }; } %http_host_cache=(); sub http_new_request { my %X = @_; my ($k,$v,%RET,%RES); http_init_request(\%RET); while( ($k,$v)=each(%X) ){ $RET{whisker}->{$k}=$v; } $RES{whisker}={}; $RES{whisker}->{MAGIC}=31340; $RES{whisker}->{uri}=''; return (\%RET,\%RES) if wantarray(); return \%RET; } sub http_new_response { my %RET; $RET{whisker}={}; $RET{whisker}->{MAGIC}=31340; $RET{whisker}->{uri}=''; return \%RET; } sub http_init_request { # doesn't return anything my ($hin)=shift; return if(!(defined $hin && ref($hin))); %$hin=(); # clear control hash $$hin{whisker}={ http_space1 => ' ', http_space2 => ' ', version => '1.1', method => 'GET', protocol => 'HTTP', port => 80, uri => '/', uri_prefix => '', uri_postfix => '', uri_param_sep => '?', host => 'localhost', timeout => 10, include_host_in_uri => 0, ignore_duplicate_headers=> 1, normalize_incoming_headers => 1, lowercase_incoming_headers => 0, require_newline_after_headers => 0, invalid_protocol_return_value => 1, ssl => 0, ssl_save_info => 0, http_eol => "\x0d\x0a", force_close => 0, force_open => 0, retry => 1, trailing_slurp => 0, force_bodysnatch => 0, max_size => 0, MAGIC => 31339 }; $$hin{'Connection'}='Keep-Alive'; $$hin{'User-Agent'}="Mozilla (libwhisker/$LW2::VERSION)"; } sub http_do_request { my ($hin, $hout)=(shift,shift); return 2 if(!(defined $hin && ref($hin) )); return 2 if(!(defined $hout && ref($hout))); %$hout=(); $$hout{whisker}={}; $$hout{whisker}->{'MAGIC'}=31340; $$hout{whisker}->{uri}=$$hin{whisker}->{uri}; if(!defined $$hin{whisker} || !defined $$hin{whisker}->{'MAGIC'} || $$hin{whisker}->{'MAGIC'}!=31339 ){ $$hout{whisker}->{error}='Input hash not initialized'; return 2; } if(defined $_[0]){ # handle extra params my %hashref; if(ref($_[0]) eq 'HASH'){ %hashref = %{ $_[0] }; } else { %hashref = @_; } $$hin{whisker}->{$_} = $hashref{$_} foreach (keys %hashref); } if(defined $$hin{whisker}->{'anti_ids'}){ # handle anti_ids my %copy=%$hin; $copy{whisker}={}; %{$copy{whisker}}=%{$$hin{whisker}}; anti_ids(\%copy,$$hin{whisker}->{'anti_ids'}); $hin=\%copy; } my $cache_key=stream_key($hin); my $stream; if(!defined $http_host_cache{$cache_key}){ $stream=stream_new($hin); $http_host_cache{$cache_key}=$stream; } else { $stream=$http_host_cache{$cache_key}; } if(!defined $stream){ $$hout{whisker}->{error}='unable to allocate stream'; return 2; } my $retry_count = $$hin{whisker}->{retry}; my $puke_flag=0; do { # retries wrapper my ($aret,$pass); if(!$stream->{valid}->()){ $stream->{clearall}->(); if(!$stream->{open}->($hin)){ $$hout{whisker}->{error}='opening stream: '. $stream->{error}; $$hout{whisker}->{error} .= '(reconnect problem after prior request)' if($puke_flag); return 2; } if(defined $$hin{whisker}->{proxy_host} && defined $$hin{whisker}->{auth_proxy_callback}){ $aret = $$hin{whisker}->{auth_proxy_callback}->($stream,$hin,$hout); return $aret if($aret != 0); # proxy auth error } if(defined $$hin{whisker}->{auth_callback}){ $aret = $$hin{whisker}->{auth_callback}->($stream,$hin,$hout); return 0 if($aret == 200); # auth not needed? return $aret if($aret != 0); # auth error } _ssl_save_info($hout,$stream) if($$hin{whisker}->{ssl_save_info}>0); } my $ret= _http_do_request_ex($stream,$hin,$hout); $puke_flag++ if($ret==1 && defined($$hout{whisker}->{http_data_sent})); return $ret if($ret==0 || $ret==2); # success or fatal socket error $retry_count--; } while( $retry_count >= 0); return 1; } sub _http_do_request_ex { my ($stream, $hin, $hout, $raw)=@_; return 2 if(! defined $stream); return 2 if(!(defined $hin && ref($hin) )); return 2 if(!(defined $hout && ref($hout))); my $W = $hin->{whisker}; %$hout=(); $$hout{whisker}={}; $$hout{whisker}->{'MAGIC'}=31340; $$hout{whisker}->{uri}=$$hin{whisker}->{uri}; $stream->{clear}->(); if(defined $raw && ref($raw)){ $stream->{queue}->($$raw); } else { $stream->{queue}->( http_req2line($hin) ); if($$W{version} ne '0.9'){ $stream->{queue}->( http_construct_headers($hin) ); $stream->{queue}->($$W{raw_header_data}) if(defined $$W{raw_header_data}); $stream->{queue}->($$W{http_eol}); $stream->{queue}->($$W{data}) if(defined $$W{data}); } # http 0.9 support } if(defined $$W{request_fingerprint}){ $$hout{whisker}->{request_fingerprint}='md5:'.md5($stream->{bufout}) if($$W{request_fingerprint} eq 'md5'); $$hout{whisker}->{request_fingerprint}='md4:'.md4($stream->{bufout}) if($$W{request_fingerprint} eq 'md4'); } if( !$stream->{'write'}->() ){ $$hout{whisker}->{'error'}='sending request: '. $stream->{error}; $stream->{'close'}->(); return 1;} $stream->{writedone}->(); $$hout{whisker}->{http_data_sent}=1; $$hout{whisker}->{'lowercase_incoming_headers'} = $$W{'lowercase_incoming_headers'}; my @H; if($$W{'version'} ne '0.9'){ do { # catch '100 Continue' responses my $resp=_http_getline($stream); if(!defined $resp){ $$hout{whisker}->{error}='error reading HTTP response'; $$hout{whisker}->{data}=$stream->{bufin}; $stream->{'close'}->(); return 1;} $$hout{whisker}->{'raw_header_data'}.=$resp if(defined $$W{'save_raw_headers'}); if($resp!~/^([^\/]+)\/(\d\.\d)([ \t]+)(\d+)([ \t]*)(.*?)([\r\n]+)/){ $$hout{whisker}->{'error'}='invalid HTTP response'; $$hout{whisker}->{'data'}=$resp; while(defined ($_=_http_getline($stream))){ $$hout{whisker}->{'data'}.=$_;} $stream->{'close'}->(); return $$W{'invalid_protocol_return_value'}||1;} $$hout{whisker}->{protocol} = $1; $$hout{whisker}->{version} = $2; $$hout{whisker}->{http_space1} = $3; $$hout{whisker}->{code} = $4; $$hout{whisker}->{http_space2} = $5; $$hout{whisker}->{message} = $6; $$hout{whisker}->{http_eol} = $7; $$hout{whisker}->{'100_continue'}++ if($4 == 100); @H=http_read_headers($stream,$hin,$hout); if(!$H[0]){ $$hout{whisker}->{'error'}='Error in reading headers: ' .$H[1]; $stream->{'close'}->(); return 1; } if( !defined $H[3] ){ # connection $H[3] = (defined $$hin{'Connection'}) ? lc($$hin{'Connection'}) : 'close'; } } while($$hout{whisker}->{'code'}==100); } else { # http ver 0.9, we need to fake it since headers are not sent $$hout{whisker}->{version} ='0.9'; $$hout{whisker}->{code} =200; $$hout{whisker}->{http_message}=''; $H[3]='close'; } if(defined $$W{data_sock}){ $$hout{whisker}->{data_sock} = $stream->{sock}; } else { if($$W{'force_bodysnatch'} || ( $$W{'method'} ne 'HEAD' && $$hout{whisker}->{'code'}!=206 && $$hout{whisker}->{'code'}!=102)){ return 1 if(!http_read_body($stream,$hin,$hout,$H[1],$H[2])); } if(($H[3] ne 'keep-alive' || ( defined $$hin{'Connection'} && $$hin{'Connection'}=~m/close/i)) && $$W{'force_open'}!=1){ $stream->{'close'}->(); } $stream->{'close'}->() if($$W{'force_close'}>0 || $$W{'ssl'}>0); } if(defined $$W{'header_delete_on_success'} && ref($$W{'header_delete_on_success'})){ foreach (@{ $$W{'header_delete_on_success'} }){ delete $hin->{$_} if(exists $hin->{$_}); } delete $$W{header_delete_on_success}; } $stream->{reqs}++; $$hout{whisker}->{'stats_reqs'}=$stream->{reqs}; $$hout{whisker}->{'stats_syns'}=$stream->{syns}; $$hout{whisker}->{'socket_state'}=$stream->{state}; delete $$hout{whisker}->{'error'}; # no error return 0; } sub http_req2line { my ($S,$hin,$UO)=('',@_); $UO||=0; if(defined $$hin{whisker}->{'full_request_override'}){ return $$hin{whisker}->{'full_request_override'}; } else { # notice the components of a request--this is for flexibility if($UO!=1){$S.= $$hin{whisker}->{'method'}. $$hin{whisker}->{'http_space1'}; if($$hin{whisker}->{'include_host_in_uri'}>0){ $S.= 'http://'; if(defined $$hin{whisker}->{'uri_user'}){ $S.= $$hin{whisker}->{'uri_user'}; if(defined $$hin{whisker}->{'uri_password'}){ $S.= ':'.$$hin{whisker}->{'uri_user'}; } $S.= '@'; } $S.= $$hin{whisker}->{'host'}. ':'.$$hin{whisker}->{'port'};}} $S.= $$hin{whisker}->{'uri_prefix'}. $$hin{whisker}->{'uri'}. $$hin{whisker}->{'uri_postfix'}; if(defined $$hin{whisker}->{'parameters'} && $$hin{whisker}->{'parameters'} ne ''){ $S.= $$hin{whisker}->{'uri_param_sep'}. $$hin{whisker}->{'parameters'};} if($UO!=1){ if($$hin{whisker}->{'version'} ne '0.9'){ $S.= $$hin{whisker}->{'http_space2'}. $$hin{whisker}->{'protocol'}.'/'. $$hin{whisker}->{'version'};} $S.= $$hin{whisker}->{'http_eol'};}} return $S;} sub _http_getline { my $stream = shift; my ($str,$t)=('',0); $t = index($stream->{bufin},"\n",0); while($t < 0){ return undef if !$stream->{read}->(); $t=index($stream->{bufin},"\n",0); } my $r = substr($stream->{bufin},0,$t+1); $stream->{bufin} = substr($stream->{bufin},$t+1); return $r; } sub _http_get { # read from socket w/ timeouts my ($stream,$amount) = @_; my ($str,$t)=('',''); while($amount > length($stream->{bufin})){ return undef if !$stream->{read}->(); } my $r = substr($stream->{bufin},0,$amount); $stream->{bufin} = substr($stream->{bufin},$amount); return $r; } sub _http_getall { my ($stream, $max_size) = @_; if($max_size){ while( $stream->{read}->()){ last if(length($stream->{bufin})>$max_size); } } else { 1 while( $stream->{read}->()); } return $stream->{bufin}; } sub http_fixup_request { my $hin=shift; return if(!(defined $hin && ref($hin))); $$hin{whisker}->{uri}='/' if($$hin{whisker}->{uri} eq ''); if($$hin{whisker}->{'version'} eq '1.1'){ $$hin{'Host'}=$$hin{whisker}->{'host'} if(!defined $$hin{'Host'}); $$hin{'Host'}.=':'.$$hin{whisker}->{'port'} if($$hin{whisker}->{port} != 80); $$hin{'Connection'}='Keep-Alive' if(!defined $$hin{'Connection'}); } if(defined $$hin{whisker}->{'data'}){ if(!defined $$hin{'Content-Length'}){ $$hin{'Content-Length'}=length($$hin{whisker}->{'data'});} if(!defined $$hin{'Content-Encoding'}){ $$hin{'Content-Encoding'}='application/x-www-form-urlencoded';} } if(defined $$hin{whisker}->{'proxy_host'}){ $$hin{whisker}->{'include_host_in_uri'}=1;} } sub http_reset { my $stream; foreach $stream (keys %http_host_cache){ next if(!ref($stream)); $stream->{'close'}->(); delete $http_host_cache{$stream}; } } sub _ssl_save_info { my ($hr,$stream)=@_; my $cert; if($stream->{streamtype}==4){ my $SSL = $stream->{sock}; $$hr{whisker}->{ssl_cipher}=Net::SSLeay::get_cipher($SSL); if($cert = Net::SSLeay::get_peer_certificate($SSL)){ $$hr{whisker}->{ssl_cert_subject} = Net::SSLeay::X509_NAME_oneline( Net::SSLeay::X509_get_subject_name($cert) ); $$hr{whisker}->{ssl_cert_issuer} = Net::SSLeay::X509_NAME_oneline( Net::SSLeay::X509_get_issuer_name($cert) ); } } if($stream->{streamtype}==5){ $$hr{whisker}->{ssl_cipher}=$stream->{sock}->get_cipher(); if($cert = $stream->{sock}->get_peer_certificate()){ $$hr{whisker}->{ssl_cert_subject}= $cert->subject_name(); $$hr{whisker}->{ssl_cert_issuer}= $cert->issuer_name(); } } } sub http_read_headers { my ($stream,$in,$hout)=@_; my $W=$in->{whisker}; my ($a,$b,$LC,$CL,$TE,$CO); my $last; while(1){ $last = pos($stream->{bufin}); if($stream->{bufin}!~m/(.*?)[\r]{0,1}\n/g){ if(!$stream->{read}->()){ last if($$W{require_newline_after_headers}==0 && length($stream->{bufin})-1 == $last); return (0,'error reading in all headers'); } pos($stream->{bufin}) = $last; next; } last if($1 eq ''); return (0, 'malformed header') if($1!~m/^([^:]+):([ \t]*)(.*)$/); $$hout{whisker}->{'abnormal_header_spacing'}++ if($2 ne ' '); $a=$1; $b=$3; $LC = lc($a); next if($LC eq 'whisker'); $TE = lc($b) if($LC eq 'transfer-encoding'); $CL = $b if($LC eq 'content-length'); $CO = lc($b) if($LC eq 'connection'); push(@{$$hout{whisker}->{cookies}},$b) if($LC eq 'set-cookie' || $LC eq 'set-cookie2'); if($$W{'lowercase_incoming_headers'}>0){ $a=$LC; } elsif($$W{'normalize_incoming_headers'}>0){ $a=ucfirst($LC); $a='ETag' if($a eq 'Etag'); $a=~s/(-[a-z])/uc($1)/eg; } push(@{$$hout{whisker}->{header_order}},$a); if(defined $$hout{$a} && $$W{ignore_duplicate_headers}!=1){ $$hout{$a} = [ $$hout{$a} ] if(!ref($$hout{$a})); push(@{$$hout{$a}},$b); } else { $$hout{$a}=$b; } } my $found=pos($stream->{bufin}); $$hout{whisker}->{'raw_header_data'}=substr($stream->{bufin},0,$found) if(defined $$W{'save_raw_headers'}); $stream->{bufin} = substr($stream->{bufin},$found); return (1,$TE,$CL,$CO); } sub http_read_body { my ($temp,$stream,$hin,$hout,$enc,$len)=('',@_); my $max_size = $hin->{whisker}->{max_size}||0; if (defined $enc && $enc eq 'chunked') { my $total = 0; my $x; if(!defined ($x=_http_getline($stream))){ $$hout{whisker}->{'error'}='Error reading chunked data length'; $stream->{'close'}->(); return 0;} $a=$x; $a=~tr/a-fA-F0-9//cd; if(length($a)>8){ $$hout{whisker}->{'error'}='Chunked size is too big: '.$x; $stream->{'close'}->(); return 0; } $len=hex($a); if($max_size){ $max_size -= $len; $len = 0 if($max_size < 0); $len = $max_size if($len > $max_size); } $$hout{whisker}->{'data'}=''; while($len>0) { # chunked sucks if(!defined ($temp=_http_get($stream,$len))){ $$hout{whisker}->{'error'}='Error reading chunked data'; $stream->{'close'}->(); return 0;} $$hout{whisker}->{'data'}=$$hout{whisker}->{'data'} . $temp; $total+=$len; if($max_size && $total >= $max_size){ $stream->{'close'}->(); return 1; } $temp=_http_getline($stream); ($temp=_http_getline($stream)) if(defined $temp && $temp=~/^[\r\n]*$/); if(!defined $temp){ $$hout{whisker}->{'error'}='Error reading chunked data'; $stream->{'close'}->(); return 0;} $temp=~tr/a-fA-F0-9//cd; if(length($temp)>8){ $$hout{whisker}->{'error'}='Chunked size is too big: '.$temp; $stream->{'close'}->(); return 0; } $len=hex($temp); $len=($max_size-$total) if($max_size && $len > ($max_size-$total)); } while(defined ($_=_http_getline($stream))){ tr/\r\n//d; last if($_ eq ''); } } else { if (defined $len) { return 1 if($len==0); $len = $max_size if($max_size && $len > $max_size); if(!defined ($$hout{whisker}->{data}=_http_get($stream,$len))){ $$hout{whisker}->{'error'}='Error reading data: '. $stream->{error}; $stream->{'close'}->(); return 0;} } else { # Yuck...read until server stops sending.... $$hout{whisker}->{data}=_http_getall($stream, $max_size); $stream->{'close'}->(); } $$hout{whisker}->{'data'}||=''; } return 1; } sub http_construct_headers { my $hin = shift; my (%SENT, $output); my $EOL=$hin->{whisker}->{http_eol}||"\x0d\x0a"; if(defined $hin->{whisker}->{header_order} && ref($hin->{whisker}->{header_order}) eq 'ARRAY'){ foreach (@{$hin->{whisker}->{header_order}}){ next if($_ eq '' || $_ eq 'whisker'); if(ref($hin->{$_})){ die("non-array header value reference") if(ref($hin->{$_}) ne 'ARRAY'); $SENT{$_}||=0; my $v=$$hin{$_}->[$SENT{$_}]; $output.="$_: $v$EOL"; } else { $output.="$_: $$hin{$_}$EOL"; } $SENT{$_}++; } } foreach (keys %$hin){ next if($_ eq '' || $_ eq 'whisker'); next if(defined $SENT{$_}); if(ref($$hin{$_})){ # header with multiple values my $key=$_; foreach (@{$$hin{$key}}){ $output.="$key: $_$EOL";} } else { # normal header $output.="$_: $$hin{$_}$EOL"; } } return $output; } sub http_close { my $hin=shift; my $cache_key=stream_key($hin); return if(!defined $http_host_cache{$cache_key}); my $stream=$http_host_cache{$cache_key}; $stream->{'close'}->(); } sub http_do_request_timeout { my ($req, $resp, $timeout)=@_; $timeout||=30; my $result; eval { local $SIG{ALRM} = sub { die "timeout\n" }; eval { alarm($timeout) }; $result = LW2::http_do_request($req, $resp); eval { alarm(0) }; }; if($@){ $result=1; $resp->{whisker}->{error}='Error with timeout wrapper'; $resp->{whisker}->{error}='Total transaction timed out' if($@=~/timeout/); } return $result; } { # start md5 packaged varbs my (@S,@T,@M); my $code=''; sub md5 { return undef if(!defined $_[0]); # oops, forgot the data return MD5->hexhash($_[0]) if(defined $AVAILABLE{'md5'}); my $DATA=_md5_pad($_[0]); &_md5_init() if(!defined $M[0]); return _md5_perl_generated(\$DATA); } sub _md5_init { return if(defined $S[0]); my $i; for($i=1; $i<=64; $i++){ $T[$i-1]=int((2**32)*abs(sin($i))); } my @t=(7,12,17,22,5,9,14,20,4,11,16,23,6,10,15,21); for($i=0; $i<64; $i++){ $S[$i]=$t[(int($i/16)*4)+($i%4)]; } @M=( 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, 1,6,11,0,5,10,15,4,9,14,3,8,13,2,7,12, 5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2, 0,7,14,5,12,3,10,1,8,15,6,13,4,11,2,9 ); &_md5_generate(); my $TEST=_md5_pad('foobar'); if( _md5_perl_generated(\$TEST) ne '3858f62230ac3c915f300c664312c63f'){ die('Error: MD5 self-test not successful.'); } } sub _md5_pad { my $l = length(my $msg=shift() . chr(128)); $ msg .= "\0" x (($l%64<=56?56:120)-$l%64); $l=($l-1)*8; $msg .= pack 'VV',$l & 0xffffffff, ($l >> 16 >> 16); return $msg; } sub _md5_generate { my $N='abcddabccdabbcda'; my ($i,$M)=(0,''); $M='&0xffffffff' if((1 << 16) << 16); # mask for 64bit systems $code=<