#!/usr/bin/perl
#
# getsearchtargets.pl
#
# Reads access-log lines on STDIN (parsed with Parse::AccessLogEntry), keeps
# the hits with status 200 and an external referer, and sorts each one into:
#   * "mention" reports  - links from known forums / webmail sites,
#   * search-string hits - the search terms extracted from a search-engine
#                          referer, paired with the file that was served,
#   * failures           - referers whose search term could not be extracted.
#
# Usage: getsearchtargets.pl [devel] [persistsearches=FILE] [persistlinks=FILE] [write]
#   devel            report "<site> asked '<term>' and got <file>" keys
#   persistsearches  tally file ("count=key" per line) for search hits
#   persistlinks     tally file ("count=key" per line) for reported links
#   write            write the updated tallies back to the files above

use strict;
use Parse::AccessLogEntry;
use MIME::Base64;

my $P=Parse::AccessLogEntry::new();

# Percent-decode a string (%XX -> character).
# The second pass turns any "%20" that is still present after the first pass
# (i.e. a double-encoded space) into a space.
sub url_unescape {
    my ($In) = @_;
    $In =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/ge;
    $In =~ s/%20/ /g;
    return $In;
}

# Split a query string ("a=1&b=2") into a hash reference of decoded
# key => value pairs.
#   - Works on a copy, so the caller's variable is not modified.
#   - Splits each pair on the first '=' only, so values containing '=' survive.
#   - Translates '+' to space in the value BEFORE percent-decoding, so an
#     encoded literal plus (%2B) is not turned into a space.
sub url_decode {
    my ($query) = @_;
    chomp $query;
    my %hash;
    for my $pair (split /&/, $query) {
        my ($key, $value) = split /=/, $pair, 2;
        $key = url_unescape($key);
        if (defined $value) {
            $value =~ tr/+/ /;
            $value = url_unescape($value);
        }
        $hash{$key} = $value;
    }
    return \%hash;
}

# Debugging helper; nothing in this script calls it.
# NOTE: it prints a trace and then exits the program before reaching return.
sub base64_unencode {
    my ($in) = @_;
    my $Ret;
    print "Input: $in\n";
    while ($in =~ s/^(..)//) {
        printf "$1 translates to %s, which becomes %s.\n", hex($1), pack 'C', hex($1);
        $Ret .= ord(hex($1));
    }
    print "Output: $Ret\n";
    exit;
    return $Ret;
}

# Load a "count=key" tally file into the hash referenced by $tally.
# Lines that do not match that shape are ignored. Dies if the file cannot
# be opened.
sub load_tally {
    my ($path, $tally) = @_;
    open my $fh, '<', $path
        or die "Unable to open file $path for read!\nIf you want to start a new tally, try 'touch $path'\n";
    while (my $line = <$fh>) {
        chomp $line;
        next unless $line =~ m/^(\d+)=(.*)$/;
        $tally->{$2} = $1;
    }
    close $fh;
    return;
}

# Write the hash referenced by $tally to $path as "count=key" lines.
# Dies if the file cannot be opened or the final close fails.
sub save_tally {
    my ($path, $tally) = @_;
    open my $fh, '>', $path
        or die "ERROR: Cannot open $path for output!\n";
    for my $key (keys %{$tally}) {
        printf {$fh} "%s=%s\n", $tally->{$key}, $key;
    }
    close $fh
        or die "ERROR: Cannot close $path after writing!\n";
    return;
}

my %NewSiteHits;      # referer site => hits whose referer could not be parsed
my %SiteHits;         # search-hit key => hits seen in this run
my %StringHits;       # declared but not used below
my $devel;
my $PersistLinks;
my $PersistSearches;
my $WritePersist;
my %Searches;         # declared but not used below
my %Fail;             # failure description => count
my %IgnoreReport;     # requested files that are never reported
my %Report;           # report line => hits seen in this run
my %QueryVar;         # referer site => name of its search-term query parameter
my %HistReports;      # report line => persistent tally
my %HistSearches;     # search-hit key => persistent tally

{
    my @List = ('/favicon.ico');
    foreach my $file (@List) {
        $IgnoreReport{$file} = '1';
    }
}

# %SkipSite values: 'report' (list the link), 'skip' (ignore the hit),
# 'skipbutcount' (only count total hits from that site).
my %SkipSite;
{
    my @Forums = (
        'jobrelatedstuff.com',
        'californiaccw.org',
        'www.bayarearidersforum.com',
        'www.glocktalk.com',
        'www.gunscal.com',
        'www.guntards.net',
        'www.ak47.net',
        'www.ar15.com',
        'www.jesseshunting.com',
        'www.pirate4x4.com',
        'www.linkedin.com',
        'www.thefiringline.com',
        'www.theboxotruth.com',
        'www.thegunstop.com',
        'www.tinyurl.com',
        'www.socaps.com',
        'www.sksboards.com',
        'www.arfcom.com',
        'www.falfiles.com',
        'www.jgsales.com',
        'www.akfiles.com',
        'surplusrifleforum.com',
        'del.icio.us',
        'www.surplusrifleforum.com',
        'ar15armory.com',
        'socalevo.com',
        'www.ar15armory.com',
        'www.socalevo.net',
        'en.wikipedia.org',
        'www.clubfrontier.org',
        'www.calguns.net',
        'www.odcmp.org',
        'www.crownvic.net',
        'www.topfreeforum.com',
        'www.jobrelatedstuff.com',
        'www.mnguntalk.com',
        'www.sacairsoft.com',
        'www.usmessageboard.com',
        'www.passiveaggressivenotes.com',
        'www.youtube.com',
        'www.hondamarketplace.com',
        'www.stockleaf.com',
        'www.ekartingnews.com',
        'www.stumbleupon.com',
        'www.pro-2nd.com',
        'www.1919a4.com',
        'forums.facepunchstudios.com',
        'forums.insmod.net',
        'www.sitehoppin.com',
        'members.fcsa.org',
        'www.defensivecarry.com',
        'rbp.f0e.net',
        'forums.1911forum.com',
        'www.californiapredatorsclub.com',
        'www.floridashootersnetwork.com',
        'www.sigforum.com',
        'forum.saiga-12.com',
        'www.reeelapse.com',
        'www.tngunowners.com',
        'board.13bels.net',
        'www.gunandgame.com',
        'www.handymanwire.com',
        'www.socalbubble.com',
        'www.4peeps.com',
        'www.nethirdgen.org',
        'www.socalsvriders.org',
        'www.zombiehunters.org',
        'www.dezertrangers.com',
        'www.gabbly.com',
        'www.yamahafz1oa.com',
        'www.858airsoft.com'
    );
    foreach my $site (@Forums) {
        $SkipSite{$site} = 'report';
    }
}

$SkipSite{'www.clusty.com'}='skip';
$SkipSite{'64.151.69.37'}='skip';
$SkipSite{'www.areth.org'}='skip';
#$SkipSite{'www.calguns.net'}='skip';
$SkipSite{'www.coldwarshooters.net'}='skip';
$SkipSite{'www.monstermangrip.com'}='skip';
$SkipSite{'www.pingdom.com'}='skip';
#$SkipSite{'groups.myspace.com'}='skip';
$SkipSite{'kamidake.areth.org'}='skip';
$SkipSite{'www.munax.com'}='skip';
$SkipSite{'majestic12.co.uk'}='skip';
$SkipSite{'www.sitedossier.com'}='skip';
$SkipSite{'mail.google.com'}='skipbutcount';
$SkipSite{'webmail.att.net'}='skipbutcount';
$SkipSite{'webmail.aol.com'}='skipbutcount';
$SkipSite{'webmail.woh.rr.com'}='skipbutcount';
$SkipSite{'mail.hughes.net'}='skipbutcount';
$SkipSite{'www.scroogle.org'}='skip';

{
    # Here, I list all the places which use 'q' as their search term.
    # No sense wasting individual lines.
    foreach my $site (
        'search.peoplepc.com', 'search.live.com', 'search.earthlink.net',
        'www.search.com', 'search.msn.com', 'www.ask.com',
        'search.comcast.net', 'search.sweetim.com', 'www.searchalot.com',
        'search.bearshare.com', 'alltheweb.com', 'www.crawler.com',
        'searchwithtednugent.prodege.com', 'search.findtarget.com',
        'www.blingo.com', 'iwon.ask.com', 'home.knology.net',
        'start.shaw.ca', 'search.conduit.com', 'daemon-search.com',
        'm.google.com', 'www.att.net', 'www.myembarq.com', 'myembarq.com',
        'search.alot.com', 'uk.ask.com', 'www.verizon.net',
        'www.armstrongmywire.com', 'www.gigablast.com',
        'broadband.zoomtown.com', 'www.l.google.com', 'www.adelphia.net',
        'www.daemon-search.com', 'search.pch.com', 'search.msntv.msn.com',
        'www.hakia.com', 'info.com'
    ) {
        $QueryVar{$site} = 'q';
    }
}

{
    # This block is just like the above block, but dedicated to 'query'
    foreach my $site (
        'search.chacha.com', 'search.aol.com', 'search.hp.my.aol.com',
        'images.snap.com', 'search.netzero.net', 'www.hotbot.com',
        'search.lycos.com', 'websearch.cs.com', 'search.cnn.com',
        'aim.search.aol.com'
    ) {
        $QueryVar{$site} = 'query';
    }
}

$QueryVar{'www.kvasir.no'}='searchExpr';
$QueryVar{'crawler.com'}='qkw';
$QueryVar{'finder.cox.net'}='SearchQuery';
$QueryVar{'wwwz.websearch.verizon.net'}='qf';
$QueryVar{'search.rr.com'}='qs';
$QueryVar{'search.mywebsearch.com'}='searchfor';
$QueryVar{'weatherbugbrowserbar.mywebsearch.com'}='searchfor';
$QueryVar{'wwwo.notfounditem.net'}='qo';
$QueryVar{'www.goodsearch.com'}='Keywords';
$QueryVar{'us.m.yahoo.com'}='p';
$QueryVar{'guide.opendns.com'}='url';
$QueryVar{'as.starware.com'}='qry';
$QueryVar{'allplus.com'}='sc';

# Command-line options. Anything unrecognised prints the usage and exits.
foreach my $arg (@ARGV) {
    if ($arg eq 'devel') {
        $devel = 1;
    }
    elsif ($arg =~ m/^persistlinks=(.*)$/) {
        $PersistLinks = $1;
    }
    elsif ($arg =~ m/^persistsearches=(.*)$/) {
        $PersistSearches = $1;
    }
    elsif ($arg eq 'write') {
        $WritePersist = 1;
    }
    else {
        print "Usage: getsearchtargets.pl [devel] [persistsearches=] [persistlinks=] [write]\n";
        exit;
    }
}

if ($WritePersist && (($PersistLinks eq '') && ($PersistSearches eq ''))) {
    print "ERROR: You cannot specify the 'write' option without specifying a file via the 'persistlinks' or 'persistsearch' flags!\n";
    exit;
}

load_tally($PersistSearches, \%HistSearches) if ($PersistSearches);
load_tally($PersistLinks,    \%HistReports)  if ($PersistLinks);

# The log is read from STDIN explicitly: the option words above are left in
# @ARGV, so the bare <> operator would try to open them as input files.
LINE: while (my $line = <STDIN>) {
    my $H = $P->parse($line);
    next LINE unless ($H->{code} eq '200');
    next LINE unless ($H->{refer});
    next LINE if ($H->{refer} eq '-');

    my $Refer = $H->{refer};
    my $File  = $H->{file};
    my $Search;

    #We ignore internal links.
    next LINE if ($Refer=~m/^http:\/\/dev.thegunwiki.com\//i);
    next LINE if ($Refer=~m/^http:\/\/[a-z]*\.?thegunwiki.com/i);
    next LINE unless ($Refer=~s/^http:\/\///);

    # Split "host/rest"; a referer without any path is skipped instead of
    # reusing captures left over from an earlier match.
    next LINE unless ($Refer =~ m/(.*?)\/(.*)/);
    my ($ReferSite, $ReferString) = ($1, $2);
    next LINE if ($ReferString eq '');
    $ReferSite = lc($ReferSite);

    if (($SkipSite{'www.'.$ReferSite}) || ($QueryVar{'www.'.$ReferSite})) {
        # print "Padding out $ReferSite and $Refer due to matching QueryVar or SkipSite entry.\n";
        $ReferSite = 'www.'.$ReferSite;
        $Refer     = 'www.'.$Refer;
    }
    # else { print "Looking for $ReferSite for $Refer; no matching QueryVar or SkipSite entry.\n"; }

    #We also ignore CSS hits.
    next LINE if ($File=~m/\.css$/i);
    next LINE if ($File=~m/\.ico$/i);
    next LINE if ($File=~m/\/pub\/TWiki\//);
    next LINE if ($File=~m/\/TWiki\/SessionPlugin/);
    next LINE if ($SkipSite{$ReferSite} eq 'skip');
    next LINE if ($SkipSite{'www.'.$ReferSite} eq 'skip');

    # The substitutions below both test for a webmail host and normalise
    # $ReferSite to its generic name; their order is significant.
    if (($SkipSite{$ReferSite} eq 'skipbutcount')||
        ($ReferSite=~s/.*\.(mail\.live\.com)/$1/)||
        ($ReferSite=~s/.*(webmail).*(\.netzero\.net)/$1$2/)||
        ($ReferSite=~s/mail\d+\.(mail.com)/$1/)||
        ($ReferSite=~s/.*\.(mail\.yahoo\.com)/$1/)) {
        next LINE if ($IgnoreReport{$File});
        $Report{"Total hits from $ReferSite into $File"}++;
        next LINE;
    }
    elsif ($SkipSite{$ReferSite} eq 'report') {
        next LINE if ($IgnoreReport{$File});
        $Refer=~s/%2F/\//g;
        $Refer=~s/%3A/:/g;
        $Refer=~s/\.com\/\/refer\.php/.com\/refer.php/;
        $Refer=~s/^www\.stumbleupon\.com\/refer\.php\?url=http:\/\//stumbleupon.com\/url\//;
        $Report{"Reporting link into $File: http://$Refer"}++;
        $HistReports{"Reporting link into $File: http://$Refer"}++;
        next LINE;
    }

    #If we got this far, it's hopefully a search.
    my ($PH, $CatchFlag);
    $ReferString=~s/.*\?//;
    $PH = url_decode($ReferString);
    next LINE if ($PH->{q}=~m/^cache\:/);
    $Search = $ReferString;
    $Search =~ s/\+/ /g;

    if ($QueryVar{$ReferSite}) {
        $Search = $PH->{$QueryVar{$ReferSite}};
    }
    elsif ($ReferSite=~m/^wwww\d*\.entry-not-found\.com$/) {
        $Search = $PH->{qo};
    }
    elsif ($ReferSite eq 'images.google.com') {
        $Search = url_unescape($PH->{prev});
        $Search =~ s/^.*q=//;
        $Search =~ s/\&.*//;
    }
    elsif ($ReferSite=~m/\.cox\.net$/) {
        $Report{"Reporting link into $File: http://$Refer"}++;
        $CatchFlag = 'reported';
    }
    elsif ($ReferSite=~m/answers\.yahoo\.com$/) {
        $Refer=~s/^\w+\.?answers\.yahoo\.com\//answers.yahoo.com\//g;
        $Refer=~s/&show=\d+//;
        $Report{"Reporting link into $File: http://$Refer"}++;
        $CatchFlag = 'reported';
    }
    elsif ($ReferSite=~s/^ms.*\.(mysearch\.com)$/$1/) {
        $Search = $PH->{searchfor};
    }
    elsif ($ReferSite=~m/^www\d?.google.[a-z]+/) {
        #printf "Google q: '%s' from $ReferString\n", $PH->{q};
        $Search = $PH->{q};
        $Search = $PH->{as_q}   if ($PH->{as_q});
        $Search = $PH->{as_epq} if ($PH->{as_epq});
    }
    elsif ($ReferSite eq 'www.metacrawler.com') {
        # The term is a path segment (/search/web/<term>/...), handled the
        # same way as the msxml.excite.com branch further down.
        $Search = ($Refer =~ m/\/search\/web\/(.*?)\//) ? url_unescape($1) : '';
        $Search =~ s/\+/ /g;
    }
    elsif ($ReferSite=~m/\.ant\.com$/) {
        $Search = $Refer;
        $Search =~ s/\/\d+$//;
        $Search =~ s/.*\///;
        $Search = url_unescape($Search);
    }
    elsif ($ReferSite=~m/.*\.mamma\.com/) {
        $Search = $PH->{query};
    }
    elsif ($ReferSite=~m/\.myway\.com$/) {
        $Search = $PH->{searchfor};
    }
    elsif ($ReferSite=~m/^www.*.charter.net/) {
        $Search = $PH->{qo};
        $Search = $PH->{qf} if ($PH->{qf});
    }
    elsif ($ReferSite eq 'home.bellsouth.net') {
        $Search = $PH->{search};
        unless ($Search) { $Search = $PH->{string}; }
    }
    elsif ($ReferSite eq 'my.att.net') {
        # NOTE(review): this branch used to "fall back" to the same 'string'
        # key a second time, which was a no-op. If a different fallback
        # parameter was intended, it still needs to be added here.
        $Search = $PH->{string};
    }
    elsif ($ReferSite=~m/\.altavista\.com$/) {
        $Search = $PH->{q};
        unless ($Search) { $Search = $PH->{aqp}; }
        unless ($Search) { $Search = $PH->{aqa}; }
    }
    elsif ($ReferSite=~m/^search\.?\S*\.netscape\.com$/i) {
        $Search = $PH->{query};
    }
    elsif (($ReferSite=~m/aolsearcht\d?\.search\.aol\.com/)||
           ($ReferSite=~m/aolsearch\.[0-9]+\.search\.aol\.com/)||
           ($ReferSite=~m/^aolsearcht?.aol.com$/)||
           ($ReferSite eq 'aolsearch.aol.ca')||
           ($ReferSite eq 'aolsearch.aol.co.uk')) {
        $Search = $PH->{query};
        unless ($Search) {
            $Report{"AOL encoded link into $File"}++;
            $CatchFlag = 'reported';
        }
    }
    elsif ($ReferSite eq 'search.juno.com') {
        $Search = $PH->{query};
    }
    elsif ($ReferSite=~m/.*\.?search.yahoo.co.jp/) {
        $Search = $PH->{p};
    }
    elsif ($ReferSite=~m/.*\.?search.yahoo.com/) {
        $Search = $PH->{p};
    }
    #Here, I deal with URL-encoding searchers
    elsif ($ReferSite=~m/www\.msplinks\.com$/) {
        $Search = $H->{refer};
        $Search =~ s/.*\///;
        $Search = decode_base64($Search);
        $Search =~ s/^\d\d//;
    }
    elsif (($ReferSite eq 'msxml.excite.com')||($ReferSite eq 'dpxmldsl.verizon.net')) {
        # Only use the capture when the match succeeded (same in the three
        # path-based branches that follow).
        $Search = ($H->{refer}=~m/\/search\/web\/(.*?)\//) ? url_unescape($1) : '';
        $Search =~ s/\+/ /g;
    }
    elsif ($ReferSite=~m/^search\d+\.info\.com$/) {
        $Search = ($H->{refer}=~m/\.info\.com\/(.*?)\?/) ? url_unescape($1) : '';
        $Search =~ s/\+/ /g;
    }
    elsif ($ReferSite eq 'www.dogpile.com') {
        $Search = ($H->{refer}=~m/\/ws\/results\/Web\/(.*?)\//) ? url_unescape($1) : '';
        $Search =~ s/\+/ /g;
    }
    elsif ($ReferSite eq 'www.searching.uk.com') {
        $Search = ($H->{refer}=~m/\.com\/(.+)\.html$/) ? url_unescape($1) : '';
        $Search =~ s/_/ /g;
        $Search =~ s/\+/ /g;
    }
    else {
        $Fail{"$File: Unparseable referral code from $ReferSite: http://$Refer"}++;
        $NewSiteHits{$ReferSite}++;
        $Search = '';
    }

    #$SiteHits{"'$Search' and got $File"}++;
    # Normalise the term: lower-case, collapse and trim whitespace.
    $Search = lc($Search);
    $Search =~ s/\s+/ /g;
    $Search =~ s/^\s+//;
    $Search =~ s/\s+$//;

    if ($CatchFlag ne '') {
        # Already counted in %Report above; nothing more to record.
    }
    elsif ($Search eq '') {
        $Fail{"Url http://$Refer linked into $File"}++;
    }
    elsif ($devel) {
        $SiteHits{"$ReferSite asked '$Search' and got $File"}++;
    }
    else {
        $File =~ s/^\/pub//;
        $File =~ s/^\/Gunwiki\///;
        $SiteHits{" * *$Search*: $File"}++;
    }
}

print "The TGW Mention Reports:\n";
foreach my $key (sort { $Report{$b}<=>$Report{$a} } keys %Report) {
    # A report whose persistent tally equals this run's count is new.
    if ($HistReports{$key} eq $Report{$key}) {
        printf " *NEW* %-s : %s\n", $key, $Report{$key};
    }
    else {
        printf " %-s : %s/%s\n", $key, $Report{$key}, $HistReports{$key};
    }
}

print "\nUndefined-search-string site hits:\n";
foreach my $key (sort { $NewSiteHits{$b}<=>$NewSiteHits{$a} } keys %NewSiteHits) {
    printf " %-s : %s\n", $key, $NewSiteHits{$key};
}

print "\nNEW search-string site hits:\n";
foreach my $key (sort { $SiteHits{$b}<=>$SiteHits{$a} } keys %SiteHits) {
    # Fold this run into the persistent tally; equal totals afterwards mean
    # the key had no history before this run.
    $HistSearches{$key} += $SiteHits{$key};
    printf " %-s : %s\n", $key, $SiteHits{$key} if ($HistSearches{$key} eq $SiteHits{$key});
}

print "\nPreviously-encountered search string site hits:\n";
foreach my $key (sort { $SiteHits{$b}<=>$SiteHits{$a} } keys %SiteHits) {
    printf " %-s : %s/%s\n", $key, $SiteHits{$key}, $HistSearches{$key} unless ($HistSearches{$key} eq $SiteHits{$key});
}

print "\nFailures to parse:\n";
foreach my $key (sort { $Fail{$b}<=>$Fail{$a} } keys %Fail) {
    printf " %-s : %s\n", $key, $Fail{$key};
}

if ($WritePersist && ($PersistSearches ne '')) {
    save_tally($PersistSearches, \%HistSearches);
}
else {
    print "Warning: Data write NOT enabled for persistent storage of incoming searches.\n";
}

if ($WritePersist && ($PersistLinks ne '')) {
    save_tally($PersistLinks, \%HistReports);
}
else {
    print "Warning: Data write NOT enabled for persistent storage of incoming links.\n";
}