// By Johan Sundström
// Has 118 other scripts.
// ==UserScript==
// @name Unpaginate pagination microformated web pages
// @namespace http://code.google.com/p/ecmanaut/
// @url http://userscripts.org/scripts/source/23175.user.js
// @description Unpaginates pages marked-up with the pagination microformat.
// @include http://*
// ==/UserScript==
// console.info("consume %x", location.href);
// Example usage:
// getMetainfo({ link: ["next"],
// meta: ["items-xpath", "pagination-container"] })
// yields { next: <url>, "items-xpath": <xpath expr> } for a page equipped
// with a microformat with a /html/head/link[@rel="next" and @href] and a
// /html/head/meta[@name="items-xpath" and @content] tag.
// Reads pagination metadata from the <head> of `doc` (or of the current
// document when `doc` is omitted, as decided by $X).
//
// Two calling conventions:
//   getMetainfo(tagName, key, doc)  -> the single value, or a falsy value
//   getMetainfo({ tagName: [key, ...], ... }, doc) -> { key: value, ... }
//     holding only the keys for which a truthy value was found.
// <link> tags are matched on @rel and yield their `href` property; <meta> tags
// are matched on @name and yield their `content` property.
function getMetainfo(specs, doc) {
  var matchAttr = { link: "rel", meta: "name" };
  var valueProp = { link: "href", meta: "content" };

  // Looks up one /html/head/<tagName> node and returns the property of interest.
  function lookup(tagName, key, context) {
    var expr = '/html/head/' + tagName + '[@' + matchAttr[tagName] + '="' + key + '"]';
    var hit = $X(expr, context);
    return hit && hit[valueProp[tagName]];
  }

  // Short form: the arguments are (tagName, key, doc) rather than (specs, doc).
  if (typeof specs == "string")
    return lookup(arguments[0], arguments[1], arguments[2]);

  var found = {};
  for (var tagName in specs) {
    var keys = specs[tagName];
    for (var k = 0; k < keys.length; k++) {
      var key = keys[k];
      var content = lookup(tagName, key, doc);
      if (content)
        found[key] = content;
    }
  }
  return found;
}
// list nodes matching this expression, optionally relative to the node `root'
// Evaluates `xpath` with result type 0 (any type) and unwraps the outcome.
//   xpath: the XPath expression to evaluate.
//   root:  optional context; either something with its own evaluate() method
//          (used directly) or a node whose ownerDocument does the evaluating.
//          When omitted, the global `document` is both evaluator and context.
// Returns a string, number or boolean for results of those types, otherwise an
// array with every node produced by iterateNext(), in iteration order.
function $x( xpath, root ) {
  var evaluator;
  if (!root)
    evaluator = document;
  else if (root.evaluate)
    evaluator = root;
  else
    evaluator = root.ownerDocument;

  var outcome = evaluator.evaluate( xpath, root || evaluator, null, 0, null );
  var type = outcome.resultType;

  if (type === outcome.STRING_TYPE)
    return outcome.stringValue;
  if (type === outcome.NUMBER_TYPE)
    return outcome.numberValue;
  if (type === outcome.BOOLEAN_TYPE)
    return outcome.booleanValue;

  var nodes = [];
  for (var node = outcome.iterateNext(); node; node = outcome.iterateNext())
    nodes.push( node );
  return nodes;
}
// Single-result companion of $x(): when $x() produces an array, only its first
// element is returned (undefined for an empty array); any other kind of result
// is passed through untouched.
function $X( xpath, root ) {
  var outcome = $x( xpath, root );
  if (outcome instanceof Array)
    return outcome[0];
  return outcome;
}
// Fetches url, turns it into an HTML DOM, and then invokes cb(dom, url, xhr).
// If runGM is set to true and the url is on the same domain as location.href,
// the loaded document will first be processed by all GM scripts that apply.
// Parameters:
//   url:   address to fetch; also the key into the html2dom cache.
//   cb:    invoked as cb(dom, url, xhr) once a document is available.
//   runGM: handed through to html2dom() unchanged.
// Returns whatever html2dom() returns on a cache hit, otherwise undefined.
function wget( url, cb/*( dom, url, xhr )*/, runGM ) {
  // Response handler: use the parsed XML document when the response carries
  // one, otherwise let html2dom() build a DOM from the response text.
  function received( xhr ) {
    var xml = xhr.responseXML;
    if (!xml) {
      html2dom( xhr.responseText, cb, url, xhr, runGM );
      return;
    }
    cb( xml, url, xhr );
  }

  // An entry stored on html2dom under this url means it was already requested.
  if (html2dom[url])
    return html2dom( null, cb, url, null, runGM );

  GM_xmlhttpRequest({ method: 'GET', url: url, onload: received });
}
// Tells whether two URLs share everything that precedes the path: the scheme,
// the separator slashes and the run of non-slash characters after them,
// compared case-insensitively. A URL the pattern does not match contributes
// null, so two such URLs also compare as equal.
function mayCommunicate(url1, url2) {
  var prefixPattern = /^[^:]+:\/*[^\/]+/;
  var prefixes = [url1, url2].map(function (address) {
    var hit = address.match(prefixPattern);
    return hit && hit[0].toLowerCase();
  });
  return prefixes[0] == prefixes[1];
}
// Well-behaved browsers (Opera, maybe WebKit) could use this simple function:
// function html2dom( html, cb/*( xml, url, xhr )*/, url, xhr ) {
// cb( (new DOMParser).parseFromString(html, "text/html"), url, xhr );
// }
// Firefox doesn't implement (new DOMParser).parseFromString(html, "text/html")
// (https://bugzilla.mozilla.org/show_bug.cgi?id=102699), so we need this hack:
// Turns `html` into a DOM by rendering it in a hidden iframe, then reports the
// resulting document through cb(doc, url, xhr).
//
// Parameters:
//   html:  markup to render; not read on a cache hit or when the iframe loads
//          `url` directly.
//   cb:    callback, invoked as cb(doc, url, xhr).
//   url:   address the markup came from; also the cache key (stored as a
//          property on the html2dom function itself).
//   xhr:   request object passed along to the callbacks.
//   runGM: when truthy and mayCommunicate(url, location.href) holds, the
//          iframe loads `url` itself instead of receiving `html` via write().
//
// Cache entries have the shape { doc, xhr, onload }, where `onload` is the
// list of callbacks still waiting and is removed once the document is ready.
//
// Returns: the callback's return value on a ready cache hit, the new queue
// length when the document is still loading, `url` when the iframe is pointed
// at it, and undefined otherwise.
function html2dom( html, cb/*( xml, url, xhr )*/, url, xhr, runGM ) {
  // Runs once the iframe document is ready: stores it in the cache entry,
  // unhooks both listeners and notifies every queued callback.
  function loaded() {
    doc = cached.doc = iframe.contentDocument;
    iframe.removeEventListener("load", loaded, false);
    doc.removeEventListener("DOMContentLoaded", loaded, false);
    var callbacks = cached.onload;
    delete cached.onload;
    //console.log("DOMContentLoaded of %x: cb %x", url, callbacks);
    setTimeout(function() { // avoid racing with GM's DOMContentLoaded callback
      //console.log("Running %x callbacks", url);
      callbacks.forEach(function(callback) { callback( doc, url, xhr ); });
    }, 10);
  }

  var cached = html2dom[url]; // cache of all already loaded and rendered DOM:s
  if (cached) {
    if (cached.onload) // still loading: join the queue
      return cached.onload.push(cb);
    // Ready: same argument order as every other callback invocation.
    return cb(cached.doc, url, cached.xhr);
  }

  var iframe = document.createElement("iframe");
  iframe.style.height = iframe.style.width = "0";
  iframe.style.visibility = "hidden";
  iframe.style.position = "absolute";
  document.body.appendChild(iframe);
  iframe.addEventListener("load", loaded, false);
  html2dom[url] = cached = { onload:[cb], xhr:xhr };

  if (runGM && mayCommunicate(url, location.href))
    return iframe.src = url; // load through GM (should be cached due to xhr)

  //console.log("May not communicate / GM scripts unwanted! (%x)", runGM);
  html = html.replace(/[\n\r]+/g, " "). // needed not freeze up(?!)
    replace(/<script.*?<\/script>/ig, ""). // no code execution on injection!
    replace(/<body(\s+[^="']*=("[^"]*"|'[^']*'|[^'"\s]\S*))*\s*onload=("[^"]*"|'[^']*'|[^"']\S*)/ig, "<body$1" );
  iframe.contentWindow.location.href = location.href; // for cross domain issues
  var doc = iframe.contentDocument;
  doc.open("text/html");
  doc.addEventListener("DOMContentLoaded", loaded, false);
  doc.write(html); // this may throw weird errors we can't catch or silence :-|
  doc.close();
}

// Empties the cache: clears the references held by every cache entry and
// removes the entry from html2dom. Own properties that are not cache objects
// (such as this `destroy` method) are left alone, so it can be called again.
html2dom.destroy = function() {
  for (var url in html2dom)
    if (html2dom.hasOwnProperty(url)) {
      var cache = html2dom[url];
      if (!cache || typeof cache != "object")
        continue; // not a cache entry
      cache.doc = cache.onload = cache.xhr = null;
      delete html2dom[url];
    }
};
// functionally belongs to html2dom above (see location.href line for details)
// Start-up sequence: bail out of nested runs, register cache cleanup, declare
// the shared state and try to initialise right away.
//
// html2dom() points its hidden iframe at location.href, so this script may be
// started again inside that iframe. If this window is framed and the parent's
// URL (fragment stripped) equals our own, stop before anything is registered.
// NOTE(review): the bare `return` below is only legal when the host wraps the
// script body in a function (as Greasemonkey is expected to do) — confirm
// before running this file in any other environment.
try { // don't run this script recursively on wget() documents on other urls
if (window.frameElement &&
window.parent.location.href.replace(/#.*/, "") == location.href)
return; // console.warn("Avoiding double firing on %x", location.href);
} catch(e) {
// Deliberately ignored: if the check itself throws, continue as a normal run.
// NOTE(review): presumably this covers a parent whose location may not be
// read from this frame — confirm.
//console.error("Double fire check error: %x", e);
}
// Drop the html2dom cache when the page is unloaded.
window.addEventListener("unload", html2dom.destroy, false);
// Shared state used by the functions below:
//   mainIndexPath - pagination-container value of this page (set by init)
//   mainItemsPath - items-xpath value of this page (set by init)
//   next          - URL of the next page to fetch
//   last          - the last item node currently on the page
//   seen          - URLs that have already been handed to wget()
var mainIndexPath, mainItemsPath, next, last, seen = {};
if (!init()) // if microformat producers have not run yet, retry on load
addEventListener("load", init, false);
// Reads the pagination metadata of the current page into the shared state
// (mainIndexPath, mainItemsPath, next, last) and, when everything needed is
// present, starts listening for scroll events.
//
// Returns true when unpagination was started and false otherwise, so the
// caller can retry later (it is also used directly as a "load" listener).
function init() {
  mainIndexPath = getMetainfo("meta", "pagination-container"); // optional
  mainItemsPath = getMetainfo("meta", "items-xpath");
  next = getMetainfo("link", "next");

  if (!next) {
    //console.info("Unpaginator: encountered last page.");
    return false;
  }
  if (!mainItemsPath) {
    //console.warn("Unpaginator found no items-xpath meta tag: aborting.");
    return false;
  }

  last = getLastItem(mainItemsPath);
  if (!last) {
    // %s (not %x, which is not a console format specifier) so that the path is
    // substituted into the message.
    console.log("Unpaginator found no items matching %s", mainItemsPath);
    return false;
  }

  listen();
  return true;
}
// Registers maybeFetch as the document's scroll handler and runs it once
// immediately, without waiting for a scroll event.
function listen() {
  var onScroll = maybeFetch;
  document.addEventListener("scroll", onScroll, false);
  //console.info("on");
  onScroll();
}
// Counterpart of listen(): unregisters maybeFetch as the document's scroll
// handler.
function deafen() {
  var onScroll = maybeFetch;
  document.removeEventListener("scroll", onScroll, false);
  //console.warn("off");
}
// Returns the last node matched by `xpath` in the current document, found by
// wrapping the expression as "(<xpath>)[last()]". A falsy `xpath` is returned
// unchanged without evaluating anything.
function getLastItem(xpath) {
  if (!xpath)
    return xpath;
  var lastOnly = "(" + xpath + ")[last()]";
  return $X(lastOnly);
}
// Scroll handler: fetches the next page once the last item on the page is
// within one and a half viewport heights of the top of the viewport.
//
// The scroll listener is removed before fetching; it is only registered again
// by inject(), which wget() calls when the page has arrived. If there is no
// next URL, or it was requested before, nothing is fetched and the listener
// stays removed.
function maybeFetch() {
  //console.info("maybeFetch()");
  if (last) {
    var lastTop = coordsOf(last).y;
    if (lastTop > pageYOffset + innerHeight * 1.5)
      return; // not preload time yet
  }

  deafen(); // only one xhr at a time

  if (!next) { // may happen in the cross-domain case
    // Fall back to the anchor designated by the next-xpath meta tag.
    var nextPath = getMetainfo("meta", "next-xpath");
    var anchor = nextPath && $X(nextPath);
    next = anchor && anchor.href;
  }

  if (!next || seen[next])
    return;

  seen[next] = true;
  wget(next, inject, true); // true: should try to run GM scripts on it first
}
// Returns { x, y }: the sums of offsetLeft and offsetTop over `node` and every
// offsetParent above it. A node without an offsetLeft property is first
// replaced by its nearest ancestor (via parentNode) that has one, or by the
// topmost ancestor when none does.
function coordsOf(node) {
  while (typeof node.offsetLeft == "undefined" && node.parentNode)
    node = node.parentNode;

  var position = { x: 0, y: 0 };
  for (var box = node; box; box = box.offsetParent) {
    position.x += box.offsetLeft;
    position.y += box.offsetTop;
  }
  return position;
}
// wget() callback: merges the freshly loaded page `doc` into the current one.
//   1. If both pages have a pagination container, the current one is replaced
//      by an imported copy of the one from `doc`.
//   2. The items of `doc` are appended to the parent of the current last item
//      (before the new container when it lives in that same parent).
//   3. `last` and `next` are refreshed and scroll listening is resumed.
// `url` and `xhr` are accepted to fit the callback signature but are not used.
function inject(doc, url, xhr) {
  var indexParent, importedIndex; // stay undefined unless the index is replaced

  // If there is a pagination container, rewrite current one from the next page
  var currentIndex = $X(mainIndexPath);
  var indexPath = getMetainfo("meta", "pagination-container", doc) ||
    mainIndexPath; // in the cross-domain case, we may have to guess.
  if (indexPath && currentIndex) {
    var incomingIndex = $X(indexPath, doc);
    if (incomingIndex) {
      indexParent = currentIndex.parentNode;
      importedIndex = document.importNode(incomingIndex, true);
      indexParent.replaceChild(importedIndex, currentIndex);
    }
  }

  // append the injected nodes at the end of this page (discounting the index)
  var itemsPath = getMetainfo("meta", "items-xpath", doc) || mainItemsPath;
  var incomingItems = $x(itemsPath, doc);
  var container = last.parentNode;
  var insertBeforeNode = indexParent == container ? importedIndex : null;
  appendTo(incomingItems, container, insertBeforeNode);

  // Refresh the shared state from the merged page, then listen again.
  last = getLastItem(mainItemsPath, doc);
  next = getMetainfo("link", "next", doc);
  listen();
}
// Imports a deep copy of every entry of `nodes` into the document owning
// `target` and inserts the copies, in order, as children of `target` directly
// before `notafter` (at the end when `notafter` is null).
// `nodes` may be any array-like; it is copied first and never modified.
function appendTo(nodes, target, notafter) {
  var queue = Array.prototype.slice.call(nodes);
  var ownerDoc = target.ownerDocument;
  for (var i = 0, count = queue.length; i < count; i++)
    target.insertBefore(ownerDoc.importNode(queue[i], true), notafter);
}