Linkchecker

By Johannes la Poutre Last update May 15, 2006 — Installed 2,535 times. Daily Installs: 4, 2, 1, 0, 1, 1, 2, 2, 3, 0, 1, 5, 4, 6, 1, 0, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 6, 0, 1, 1, 3
/* vim: ts=4 noet ai :
$Id: $

Linkchecker - (c) 2006 J.Q. la Poutre

This script spiders all links on a web page and checks whether they
can be followed.

Somewhat more detailed:
Every HTTP link is polled with a HEAD request, and, depending on the 
returned status code, the link is visually marked (by color and title)
according to the result.

Some remarks
- pages in frames are omitted
- only HTTP(S) links are checked
- linked images are replaced by their ALT text (or "img")
- the script uses a spider pool of max. 4 simultaneous requests
- time out per request is handled by the browser, which can take some time
- check for the NoScript extension settings if "Initializing..." takes forever.
- relative links don't work in locally loaded files (GM security limitation)


LICENSE
=======

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA


CHANGELOG
=========

TODO:
	- smart handling of frame sites
	- use <base href...>
	- parse <area href...> elements
	- parse "invisible" link elements: <link rel=...> etc.

Version 1.00
    - initial release
    

*/
// ==UserScript==
// @name           Linkchecker
// @namespace      http://joe.lapoutre.com/BoT/Javascript
// @description    Check all links on a web page
// @include        *testpage*
// @version	       1.00
// ==/UserScript==


/*
 * Global state and UI event handlers of the link checker.
 *
 * The evt* handlers are registered as DOM event listeners, so they
 * refer to gLinkchecker by name instead of relying on "this".
 */
var gLinkchecker = {
	MAXREQ:   4,    // max. number of simultaneous page requests
	links: [],      // queue of Link objects that still have to be checked
	requests: 0,    // number of active XHR's
	errors:   0,    // number of erroneous links
	interval: null, // heartbeat interval id, as returned by setInterval
	/*
	 * Collect all <a href> elements of the document, reset their
	 * styling, flag link types that are not checked (silver background)
	 * and queue every remaining link as a Link object.
	 */
	populate: function() {
		this.replaceImgs();
		var anchors = document.evaluate("//a[@href]", document, null,
			XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
		for (var i = 0; i < anchors.snapshotLength; i++) {
			var l = anchors.snapshotItem(i);
			// declared with var so it stays local to this function
			// (an undeclared assignment throws in strict mode)
			var href = l.getAttribute("href");
			l.style.background = "white none";
			l.style.color = "darkblue";
			// textDecoration is a property of the style object
			l.style.textDecoration = "underline";
			l.setAttribute("title", "Link wil be checked");
			// filter out javascript: mailto: ftp:// etc. links
			if (href.match(/^(?!http)[^\/:]{2,}:/i)) {
				l.setAttribute("title", "Link type not checked");
				l.style.backgroundColor = "silver";
				continue;
			}
			// skip links to internal anchors
			if (href.indexOf('#') == 0) {
				l.setAttribute("title", "Internal anchor, not ckecked");
				l.style.backgroundColor = "silver";
				continue;
			}
			this.links.push(new Link(l, href));
		}
	},
	/*
	 * Replace every image that is a direct child of a link by a text
	 * node holding "[alt text]" (or "[img]" when there is no alt text),
	 * so that the link background color becomes visible.
	 */
	replaceImgs: function() {
		var imgs = document.evaluate("//a[@href]/img", document, null,
			XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
		for (var i = 0; i < imgs.snapshotLength; i++) {
			var img = imgs.snapshotItem(i);
			var txt = "[" + (img.getAttribute("alt") || "img") + "]";
			var tn = document.createTextNode(txt);
			img.parentNode.replaceChild(tn, img);
		}
	},
	/*
	 * Click handler of the initial button state: collect the links and
	 * start the heartbeat; the button becomes a "stop" button.
	 */
	evtStart: function() {
		var btn = document.getElementById("gm_lichckr_btn");
		btn.setAttribute("value", "Initializing...");
		btn.style.backgroundColor = "orange";
		btn.removeEventListener('click', gLinkchecker.evtStart, true);
		btn.addEventListener('click', gLinkchecker.evtStop, true);
		gLinkchecker.populate();
		gLinkchecker.interval = setInterval(heartBeat, 333);
	},
	/*
	 * Click handler while checking: after confirmation, stop the
	 * heartbeat and turn the button into a "resume" button.
	 */
	evtStop: function() {
		if (window.confirm("Stop Link Checker?")) {
			clearInterval(gLinkchecker.interval);
			// output results right away
			var btn = document.getElementById("gm_lichckr_btn");
			btn.setAttribute("value", "Resume");
			btn.style.backgroundColor = "lime";
			btn.removeEventListener('click', gLinkchecker.evtStop, true);
			btn.addEventListener('click', gLinkchecker.evtResume, true);
		}
	},
	/*
	 * Click handler while stopped: restart the heartbeat and turn the
	 * button into a "stop" button again.
	 */
	evtResume: function() {
		gLinkchecker.interval = setInterval(heartBeat, 200);
		var btn = document.getElementById("gm_lichckr_btn");
		btn.setAttribute("value", "Resuming...");
		btn.style.backgroundColor = "orange";
		btn.removeEventListener('click', gLinkchecker.evtResume, true);
		btn.addEventListener('click', gLinkchecker.evtStop, true);
	},
	/*
	 * Called by heartBeat when the queue is empty and no request is
	 * active: show the number of bad links and disable the button.
	 */
	evtDone: function() {
		var btn = document.getElementById("gm_lichckr_btn");
		btn.setAttribute("value", "Found " + gLinkchecker.errors +
			" bad link" + ((gLinkchecker.errors != 1) ? "s" : ""));
		btn.setAttribute("disabled", "disabled");
		btn.style.backgroundColor = "lime";
	},
	/*
	 * Add the "Check links" button to the page. Does nothing inside
	 * frames or when the document has no <body> element.
	 */
	initialize: function() {
		// work in main window only (too many iframe crap sites, sorry)
		if (window != top) return;

		// without a body there is nothing to attach the button to
		var body = document.getElementsByTagName("body")[0];
		if (!body) return;

		// "start" button on page
		var ovl = document.createElement("input");
		ovl.setAttribute("id", "gm_lichckr_btn");
		ovl.setAttribute("type", "button");
		ovl.setAttribute("value", "Check links");
		ovl.style.position = "fixed";
		ovl.style.zIndex = 99999; // insane, sometimes needed though
		ovl.style.top = "12px";
		ovl.style.right = "12px";
		ovl.style.backgroundColor = "lime";
		// checking starts when the button is clicked
		ovl.addEventListener('click', gLinkchecker.evtStart, true);
		body.appendChild(ovl);
	}
};

/*
 * Returns a closure with embedded object reference
 * see: http://jibbering.com/faq/faq_notes/closures.html#clObjI
 *
 * obj - link object; only obj.elt (the anchor element) is used.
 *
 * The returned function takes a response-like object with "status" and
 * "statusText", colors and titles the anchor accordingly, counts bad
 * links in gLinkchecker.errors and decrements gLinkchecker.requests.
 * It handles one response only: further calls are ignored.
 */
function getProcessFunc(obj) {
	return (function(res) {
		// The same handler is registered for onload and onerror (see
		// Link.ping). obj is cleared after the first response, so a
		// second call must neither touch it nor decrement the request
		// counter once more.
		if (!obj) return;
		// for more HTTP 1.1 status codes, see
		// http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
		switch (res.status) {
			// OK results
			case 200:
			case 302: // redirect of kinds
			case 303:
			case 304: // not modified
				// the 3XX family is handled by XHR,
				// not likely that we see these codes
				obj.elt.style.backgroundColor = "#7CFC00";
				break;
			// authorization required
			case 401: // only when no valid credentials were used
				obj.elt.style.backgroundColor = "yellow";
				break;
			// likely persistent errors
			case 403: // forbidden
			case 404: // not found
			case 500: // internal server error
			case 501: // not implemented
			case 502: // bad gateway
				obj.elt.style.backgroundColor = "#FF6347";
				gLinkchecker.errors++;
				break;
			// possibly transient errors
			case 405: // method not allowed
			case 503: // service unavailable
				obj.elt.style.backgroundColor = "orange";
				gLinkchecker.errors++;
				break;
			case "XHR": // the request itself threw an exception
				obj.elt.style.backgroundColor = "silver";
				break;
			default:
				// any other status: keep the current background color
		}
		obj.elt.setAttribute("title",
			(res.status || 'Status:') + " " +
			(res.statusText || 'Unknonwn'));
		// flag this request as complete, decrease counter
		gLinkchecker.requests--;
		// now do away with the link object, to give garbage collection a chance
		obj = null;
	});
}

/*
 * Link object: couples an anchor element with the URL to be checked.
 *
 * elt  - the anchor element that is marked with the result
 * href - value of its href attribute; made absolute with mkAbsolute()
 */
function Link(elt, href) {
	this.elt = elt;
	this.href = mkAbsolute(href);
	// do a HEAD request; the outcome is passed to the handler created
	// by getProcessFunc()
	this.ping = function() {
		var func = getProcessFunc(this);
		// Count the request before it is issued: the handler always
		// decrements gLinkchecker.requests, also when it is called from
		// the catch block below, so the counter stays balanced.
		gLinkchecker.requests++;
		try {
			// XHR, greasemonkey version works cross site.
			GM_xmlhttpRequest(
				{
					method: 'HEAD',
					url: this.href,
					headers: {
							'User-Agent': 'Mozilla/5.0 (compatible)',
							'Referer': window.location.href
					},
					onload: func,
					// weird behavior of XHR with HEAD in Firefox,
					// apparently the error handler is called,
					// when the request works perfectly fine
					onerror: func
				});
		} catch (e) {
			func({status: "XHR", statusText: e.toString()});
		}
	};

}

/*
 * Make a link absolute.
 *
 * rel - href value: absolute, protocol-relative ("//host/path"),
 *       root-relative ("/path") or relative to the current document.
 *
 * The reference is resolved by the standard URL parser against the
 * document base URI (which honors a <base href> element), or against
 * window.location.href when no document base is available. This also
 * covers query-only references ("?page=2") and embedded "./" and "../"
 * segments. Returns the absolute URL as a string; a reference that
 * cannot be parsed is returned unchanged.
 */
function mkAbsolute(rel) {
	var base = window.location.href;
	if (typeof document !== "undefined" && document.baseURI) {
		base = document.baseURI;
	}
	try {
		return new URL(rel, base).href;
	} catch (e) {
		// unparsable reference: hand it back as is, the request for it
		// will then fail and the link gets marked accordingly
		return rel;
	}
}


/*
 * Interval callback driving the spider pool.
 *
 * Per tick at most one queued link is started, provided fewer than
 * MAXREQ requests are active. When the queue is empty and no request is
 * active any more, the interval is cleared and the result is shown.
 */
function heartBeat() {
	var checker = gLinkchecker;
	var queue = checker.links;

	var slotFree = checker.requests < checker.MAXREQ;
	if (slotFree && queue.length) {
		// show how many links are still waiting
		document.getElementById("gm_lichckr_btn")
			.setAttribute("value", "Spidering (" + queue.length + ")...");
		// take the next link from the front of the queue and check it
		var next = queue.shift();
		next.ping();
	}

	// evaluated after a possible ping() above, as that may already
	// have changed the request counter
	var finished = (checker.requests <= 0) && (queue.length == 0);
	if (!finished) return;

	clearInterval(checker.interval);
	// report once the last request has been processed
	checker.evtDone();
}



// Entry point: add the "Check links" button to the page
// (initialize() returns early when running inside a frame).
gLinkchecker.initialize();

// end user script