#!/bin/sh
# ionax - fetch an HTML page and the files it references inline (LINK HREF,
# IMG SRC, BACKGROUND) over plain HTTP using nc, and report the byte totals.
#
# Usage: ionax [-v] [-k] URL
#   -v  verbose: print per-file details
#   -k  keep the downloaded files in /tmp/ionax.<host>
#
# Born Nov 2 1999
# Wed Nov 3 13:45:33 EDT 1999
#   Fixed lack of Host: hostname to make virtual hosts based on hostname work
#   (as in www.mozillazine.org)
# Thu Feb 17 00
#   Can't use content-length because dynamic pages wouldn't allow for it
#   We've got some IMG SRC parsing - alpha quality but ok
#   Getting better (at 5am) and seems solid enough
#   Damn yahoo uses SRC=filename with no quotes!
#   And now, LINK processing - has to be redone into an awk function, but
#   it's looking okay for now
# Wed May 17 00 - released v0.1
#   Chucked HTTP 1.1 and added background processing
# Fri Jul 21 00 - released v0.2
#   Integrated the awk script into this file (damned sprintfs for quotes added
#   too) and added -v and -k processing. Preparing for the holy rewrite.
# Tue Aug 15 00 - released v0.31
#   Fixed stupid bug with relative paths; possibly still broken (still
#   checking) - looks okay to me
# Lacks CSS background-image processing
# Lacks FRAME processing (ugh)
# Will always lack:
#   Javascript - die, monster.
#   META REFRESH processing
#   Location processing
#
# NOTE(review): the copy of this script under review had a span of text
# missing (from the middle of urldir up to the usage message). The helpers
# urlfile, lochost, locport, locfile, the option parsing and the first usage
# line below are reconstructed from how the surviving code calls them;
# compare against a pristine copy if one exists.

# Print an error on stderr and abort.
die() {
  printf 'ionax: %s\n' "$*" >&2
  exit 1
}

# Print the usage text (on stdout, exit status 0, as the script always did).
usage() {
  # NOTE(review): first line reconstructed; the original text was lost.
  echo "usage: ionax [-v] [-k] <url>"
  echo "ionax: where's da input?"
  echo "ionax: full URL please, and http only."
  echo "ionax: -v for verbose, -k to keep files pulled"
  echo ""
  exit
}

# httphead FILE [DIR] [HOST]
# Write an HTTP/1.0 GET request for /DIRFILE to stdout.
# HOST defaults to the global $hostname; pass it explicitly when the request
# goes to a different server so the Host: header matches the connection.
httphead() {
  # The format is reused for every argument, so each header line gets its
  # CRLF and the trailing empty argument yields the terminating blank line.
  printf '%s\r\n' \
    "GET /${2-}${1-} HTTP/1.0" \
    "Host: ${3:-$hostname}" \
    "Accept: text/html, text/plain, audio/*, image/*, */*;q=0.01" \
    "Accept-Encoding: gzip, compress" \
    "Accept-Language: en" \
    "User-Agent: Ionax" \
    ""
}

# url_authority URL
# Print the "host[:port]" part of a URL that contains "//".
url_authority() {
  printf '%s\n' "$1" | cut -d/ -f3
}

# urlhost REF
# Print the host named in REF, or the page's host ($hostname) when REF
# carries none (e.g. SRC="aaa/aaa").
urlhost() {
  case $1 in
    *//*) url_authority "$1" | cut -d: -f1 ;;
    *) printf '%s\n' "$hostname" ;;
  esac
}

# urlport REF
# Print the port named in REF. An absolute REF without a port gets 80; a
# relative REF lives on the page's own server, so it gets the page's port
# ($port, or 80 when that is not set).
urlport() {
  case $1 in
    *//*)
      _up_auth=$(url_authority "$1")
      case $_up_auth in
        *:*) printf '%s\n' "$_up_auth" | cut -d: -f2 ;;
        *) echo 80 ;;
      esac
      ;;
    *) printf '%s\n' "${port:-80}" ;;
  esac
}

# urldir URL
# Print the directory part of the URL path as "dir1/dir2/" (empty when the
# path has no directory component).
urldir() {
  _ud_path=$1
  case $_ud_path in
    *//*) _ud_path=$(printf '%s\n' "$_ud_path" | cut -d/ -f4-) ;;
  esac
  _ud_path=${_ud_path%%\?*} # slashes inside a query string are not directories
  case $_ud_path in
    */*) printf '%s/\n' "${_ud_path%/*}" ;;
    *) echo "" ;;
  esac
}

# urlfile REF
# Print the request path of REF without its leading slash.
# NOTE(review): reconstructed - original definition was lost.
urlfile() {
  case $1 in
    *//*) printf '%s\n' "$1" | cut -d/ -f4- ;;
    /*) printf '%s\n' "${1#/}" ;;
    *) printf '%s\n' "$1" ;;
  esac
}

# lochost URL / locport URL / locfile URL
# Host, port (default 80) and path (no leading slash) of the full URL given
# on the command line. Unlike urlhost/urlport they have no page to fall back
# on. NOTE(review): reconstructed - original definitions were lost.
lochost() {
  url_authority "$1" | cut -d: -f1
}

locport() {
  _lp_auth=$(url_authority "$1")
  case $_lp_auth in
    *:*) printf '%s\n' "$_lp_auth" | cut -d: -f2 ;;
    *) echo 80 ;;
  esac
}

locfile() {
  printf '%s\n' "$1" | cut -d/ -f4-
}

# extract_refs FILE
# Print, one per line, every LINK HREF, IMG SRC and BACKGROUND value found in
# FILE. Values may be double-quoted, single-quoted or bare.
extract_refs() {
  awk '
    # Value of attribute ATTR (upper case, ending in "=") inside TAG,
    # or "" when the attribute is missing or its quote is never closed.
    function attrval(tag, attr,    pos, rest, quote, stop) {
      pos = index(toupper(tag), attr)
      if (pos == 0)
        return ""
      rest = substr(tag, pos + length(attr))
      quote = substr(rest, 1, 1)
      if (quote == "\"" || quote == sq) {
        rest = substr(rest, 2)
        stop = index(rest, quote)
        if (stop > 0)
          return substr(rest, 1, stop - 1)
        return ""
      }
      # Bare value (SRC=aaa): it ends at whitespace or at the closing ">".
      if (match(rest, /[ \t\r\n>]/))
        return substr(rest, 1, RSTART - 1)
      return rest
    }

    function emit(value) {
      if (value != "")
        print value
    }

    BEGIN {
      RS = "<"                 # one record per tag
      sq = sprintf("%c", 39)   # a single quote, unspellable inside this script
    }

    toupper($0) ~ /^LINK/       { emit(attrval($0, "HREF=")) }
    toupper($0) ~ /^IMG/        { emit(attrval($0, "SRC=")) }
    toupper($0) ~ /BACKGROUND=/ { emit(attrval($0, "BACKGROUND=")) }
  ' "$1"
}

# fetch HOST PORT OUTFILE
# Send the request read from stdin to HOST:PORT, save the complete response
# (headers included) in OUTFILE and print its size in bytes.
fetch() {
  nc "$1" "$2" | tee "$3" | wc -c | tr -d '[:space:]'
}

main() {
  verbose=0
  nokeep=1
  url=

  # NOTE(review): option handling reconstructed. Flags are accepted before or
  # after the URL because the original position could not be determined.
  for arg in "$@"; do
    case $arg in
      -v) verbose=1 ;;
      -k) nokeep=0 ;;
      -vk | -kv)
        verbose=1
        nokeep=0
        ;;
      -*) usage ;;
      *) [ -n "$url" ] || url=$arg ;;
    esac
  done

  # "full URL please": without the "//" the host cannot be located.
  case $url in
    *//*) ;;
    *) usage ;;
  esac

  hostname=$(lochost "$url")
  port=$(locport "$url")
  file=$(locfile "$url")
  dir=$(urldir "$url")
  [ -n "$hostname" ] || usage
  tmpid=$hostname
  # NOTE(review): predictable name under /tmp; kept because it is where -k
  # users expect to find the files.
  workdir=/tmp/ionax.$tmpid

  if [ "$verbose" -gt 0 ]; then
    printf 'Host: %s\n' "$hostname"
    printf 'Port: %s\n' "$port"
    printf 'File: /%s\n' "$file"
    printf 'Dir: %s\n' "$dir"
  fi

  command -v nc >/dev/null 2>&1 || die "nc not found"
  mkdir -p -- "$workdir" || die "cannot create $workdir"
  if [ "$nokeep" -gt 0 ]; then
    # Remove the downloads on every exit path, including interruption.
    trap 'rm -rf -- "${workdir:?}"' EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM
  fi

  size=$(httphead "$file" | fetch "$hostname" "$port" "$workdir/html")
  n=0
  total=$size
  if [ "$verbose" -gt 0 ]; then
    echo "HTML Size: $size"
    echo "--"
  fi

  files=$(extract_refs "$workdir/html" | sort -u)

  # Split the list into words with globbing off, so that a "?" or "*" in a
  # reference is never expanded against the current directory.
  set -f
  # shellcheck disable=SC2086 # word splitting is intended here
  set -- $files
  set +f
  printf '%s\n' "$*" >"$workdir/depend"

  for ref in "$@"; do
    hl=$(urlhost "$ref")
    pl=$(urlport "$ref")
    fl=$(urlfile "$ref")
    # Only relative references are resolved against the page's directory.
    case $ref in
      *//* | /*) base= ;;
      *) base=$dir ;;
    esac
    size=$(httphead "$fl" "$base" "$hl" | fetch "$hl" "$pl" "$workdir/inline.$n")
    total=$((total + size))
    n=$((n + 1))
    if [ "$verbose" -gt 0 ]; then
      echo "File $fl @ $size bytes"
      echo "Total $total bytes ($n inline files)"
      echo "--"
    fi
  done

  echo "Total $total bytes ($n inline files)"
}

main "$@"