Mega Code Archive

 
Categories / Php / MySQL Database
 

Identify and log search engine access (spiders, robots, etc.) to a page

<?php
/**
 * Identify search-engine access (spiders, robots, etc.) to the including page
 * and append one line per detected visit to a log file.
 *
 * Classification order:
 *   1. the User-Agent contains one of $spider_footprint      -> spider
 *   2. the remote address starts with one of $spider_ip       -> spider
 *   3. the User-Agent contains one of $browser_footprint      -> regular browser
 *   4. anything else is assumed to be a smaller search engine -> spider
 *
 * After the script has run, $is_spider holds 1 (spider) or 0 (browser).
 * Logging is best effort: if the log file is missing or not writable the
 * request is classified but nothing is written and no warning is raised.
 */

/**** user variables ****/

/* Create this file beforehand and make it writable by the web server user. */
$log_file = "/home/joe/log/spiderlog";

$date = date("d.M.Y H:i:s");

/* Major search engines match either $spider_footprint or $spider_ip.  */
/* If no match can be made, popular browsers are sorted out and the    */
/* request is supposed to be from a smaller search engine.             */

/* Case-sensitive substrings of the User-Agent header. The first letter is */
/* left off on purpose so that e.g. "Google" and "google" both match.      */
$spider_footprint = array(
    "cooter", "lurp", "rawler", "pider", "obot", "eek", "oogle", "canner",
    "rachnoidea", "ulliver", "arvest", "ireball", "idewinder"
);

/* Address prefixes (or full addresses) of known crawlers. */
$spider_ip = array(
    "204.123.", "204.74.103.", "203.108.10.", "195.4.183.", "195.242.46.",
    "198.3.97.", "204.62.245.", "193.189.227.", "209.1.12.", "204.162.96.",
    "204.162.98.", "194.121.108.", "128.182.72.", "207.77.91.", "206.79.171.",
    "207.77.90.", "208.213.76.", "194.124.202.", "193.114.89.", "193.131.74.",
    "131.84.1.", "208.219.77.", "206.64.113.", "195.186.1.", "195.3.97.",
    "194.191.121.", "139.175.250.", "209.73.233.", "194.191.121.", "198.49.220.",
    "204.62.245.", "198.3.99.", "198.2.101.", "204.192.112.", "206.181.238",
    "208.215.47.", "171.64.75.", "204.162.98.", "204.162.96.", "204.123.9.52",
    "204.123.2.44", "204.74.103.39", "204.123.9.53", "204.62.245.", "206.64.113.",
    "194.100.28.20", "204.138.115.", "94.22.130.", "164.195.64.1", "205.181.75.169",
    "129.170.24.57", "204.162.96.", "204.162.96.", "204.162.98.", "204.162.96.",
    "207.77.90.", "207.77.91.", "208.200.146.", "204.123.9.20", "204.138.115.",
    "209.1.32.", "209.1.12.", "192.216.46.49", "192.216.46.31", "192.216.46.30",
    "203.9.252.2"
);

/* 'Mozilla' is not included, for many spiders pretend to be a Mozilla */
/* browser. Regular Mozilla browsers almost always include one of the  */
/* footprints below additionally.                                      */
$browser_footprint = array(
    "95", "98", "MSIE", "NT", "Opera", "16", "32", "MAC", "Mac", "X11",
    "WebTV", "OS", "Lynx", "IBrowse", "IWENG", "PRODIGY", "Mosaic", "InterGO",
    "Gold", "zzZ", "Mozzarella", "Vampire", "Cache", "libwww", "TURBO",
    "WebCompass", "Konqueror", "Wget", "Amiga"
);

/**** spider or not? ********/

/* Guarded so that including this file more than once does not redeclare it. */
if (!function_exists('spiderlog_classify')) {
    /**
     * Decide whether a request looks like it comes from a search-engine spider.
     *
     * @param string $agent             User-Agent header ('' when absent)
     * @param string $host_ip           remote address ('' when unknown)
     * @param array  $spider_footprint  substrings that mark a spider User-Agent
     * @param array  $spider_ip         address prefixes of known spiders
     * @param array  $browser_footprint substrings that mark a regular browser
     * @return int 1 for a spider, 0 for a regular browser
     */
    function spiderlog_classify($agent, $host_ip, $spider_footprint, $spider_ip, $browser_footprint)
    {
        foreach ($spider_footprint as $footprint) {
            if ($footprint !== '' && strpos($agent, $footprint) !== false) {
                return 1;
            }
        }

        foreach ($spider_ip as $prefix) {
            /* Anchor at the start of the address: a plain substring search */
            /* would let "94.22.130." match "194.22.130.7".                 */
            if ($prefix !== '' && strncmp($host_ip, $prefix, strlen($prefix)) === 0) {
                return 1;
            }
        }

        foreach ($browser_footprint as $footprint) {
            if ($footprint !== '' && strpos($agent, $footprint) !== false) {
                return 0;
            }
        }

        return 1; /* when in doubt, it's logged */
    }
}

/* Prefer $_SERVER; fall back to the environment for CGI-style setups. */
$agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? (string) $_SERVER['HTTP_USER_AGENT']
    : (string) getenv('HTTP_USER_AGENT');
$host_ip = isset($_SERVER['REMOTE_ADDR'])
    ? (string) $_SERVER['REMOTE_ADDR']
    : (string) getenv('REMOTE_ADDR');

$is_spider = spiderlog_classify($agent, $host_ip, $spider_footprint, $spider_ip, $browser_footprint);

/*** log the spider access ***/
if ($is_spider === 1 && is_writable($log_file)) {
    /* Reverse-resolve only syntactically valid addresses; otherwise (or when */
    /* the lookup fails) the address itself is logged as the host name.       */
    $host_name = $host_ip;
    if (filter_var($host_ip, FILTER_VALIDATE_IP) !== false) {
        $resolved = gethostbyaddr($host_ip);
        if ($resolved !== false) {
            $host_name = $resolved;
        }
    }

    /* The request URL and the User-Agent are client-controlled: collapse   */
    /* control characters so a request cannot inject additional log lines. */
    $control_chars = '/[\x00-\x1F\x7F]+/';
    $log_url = preg_replace($control_chars, ' ', isset($_SERVER['PHP_SELF']) ? (string) $_SERVER['PHP_SELF'] : '');
    $log_agent = preg_replace($control_chars, ' ', $agent);
    $log_host = preg_replace($control_chars, ' ', $host_name);

    $newentry = "URL: $log_url, Date: $date, RemoteHost: $log_host ($host_ip), UserAgent: $log_agent\n";

    /* Append instead of reading and rewriting the whole file: constant cost */
    /* per hit, and concurrent requests cannot drop each other's entries.    */
    $fp = fopen($log_file, "a");
    if ($fp !== false) {
        /* The lock is advisory; still write if it cannot be obtained. */
        $locked = flock($fp, LOCK_EX);
        fwrite($fp, $newentry);
        if ($locked) {
            flock($fp, LOCK_UN);
        }
        fclose($fp);
    }
}
?>