Source for "Unpaginate pagination microformated web pages"

By Johan Sundström
Has 118 other scripts.


// ==UserScript==
// @name           Unpaginate pagination microformated web pages
// @namespace      http://code.google.com/p/ecmanaut/
// @url            http://userscripts.org/scripts/source/23175.user.js
// @description    Unpaginates pages marked-up with the pagination microformat.
// @include        http://*
// ==/UserScript==

// console.info("consume %x", location.href);

// Example usage:
// getMetainfo({ link: ["next"],
//               meta: ["items-xpath", "pagination-container"] })
// yields { next: <url>, "items-xpath": <xpath expr> } for a page equipped
// with a microformat with a /html/head/link[@rel="next" and @href] and a
// /html/head/meta[@name="items-xpath" and @content] tag.
function getMetainfo(specs, doc) {
  // Which attribute identifies each supported head tag, and which one is read.
  var find = { link: "rel", meta: "name" };
  var pick = { link: "href", meta: "content" };

  // Value of the first /html/head/<tag> whose identifying attribute equals
  // `attr', or a falsy value when $X finds no such node.
  function getOne(tag, attr, doc) {
    var path = '/html/head/'+ tag +'[@'+ find[tag] +'="' + attr +'"]';
    var node = $X(path, doc);
    return node && node[pick[tag]];
  }

  // Shorthand form getMetainfo(tag, attr[, doc]): here `specs' holds the tag
  // name and `doc' holds the attribute value; a document, if given, is third.
  if (typeof specs == "string")
    return getOne(specs, doc, arguments[2]);

  // Spec form: { tag: [attr, ...], ... } => { attr: value, ... }, listing only
  // the attributes for which a truthy value was found.
  var info = {};
  for (var tag in specs) {
    var wanted = specs[tag];
    for (var n = 0; n < wanted.length; n++) {
      var value = getOne(tag, wanted[n], doc);
      if (!value)
        continue;
      info[wanted[n]] = value;
    }
  }
  return info;
}

// list nodes matching this expression, optionally relative to the node `root'
function $x( xpath, root ) {
  // Pick the document that evaluates the expression: the page's own document
  // when no root is given, root itself when it can evaluate, otherwise the
  // document that owns root.
  var doc;
  if (!root)
    doc = document;
  else if (root.evaluate)
    doc = root;
  else
    doc = root.ownerDocument;

  // Result type 0 lets the engine choose the type that fits the expression.
  var got = doc.evaluate( xpath, root||doc, null, 0, null );

  // Scalar results are returned as plain values...
  if (got.resultType === got.STRING_TYPE)
    return got.stringValue;
  if (got.resultType === got.NUMBER_TYPE)
    return got.numberValue;
  if (got.resultType === got.BOOLEAN_TYPE)
    return got.booleanValue;

  // ...anything else is drained into an array, in iteration order.
  var nodes = [];
  for (var node = got.iterateNext(); node; node = got.iterateNext())
    nodes.push( node );
  return nodes;
}

// First match of `xpath' (optionally relative to `root'): the first node when
// $x yields a list (undefined for an empty list), or the scalar value as-is
// when the expression evaluates to a string, number or boolean.
function $X( xpath, root ) {
  var found = $x( xpath, root );
  if (found instanceof Array)
    return found[0];
  return found;
}

// Fetches url, turns it into an HTML DOM, and then invokes cb(dom, url, xhr).
// If runGM is set to true and the url is on the same domain as location.href,
// the loaded document will first be processed by all GM scripts that apply.
function wget( url, cb/*( dom, url, xhr )*/, runGM ) {
  // html2dom keeps its per-url cache as properties on itself; on a hit (or a
  // load already in progress) let it deliver the document without a request.
  if (html2dom[url])
    return html2dom(null, cb, url, null, runGM);

  // Hand over a ready-parsed XML document directly when the response has one;
  // otherwise have html2dom build a DOM from the response text.
  function onload( xhr ) {
    var xml = xhr.responseXML;
    if (xml)
      cb( xml, url, xhr );
    else
      html2dom( xhr.responseText, cb, url, xhr, runGM );
  }

  GM_xmlhttpRequest({ method:'GET', url:url, onload:onload });
}

// True when both urls share the same lower-cased "scheme + authority" prefix,
// i.e. everything up to (not including) the first slash of the path. Urls the
// pattern does not match at all yield null, and two nulls compare as equal.
function mayCommunicate(url1, url2) {
  var prefix = /^[^:]+:\/*[^\/]+/;

  function beforePath(url) {
    var hit = url.match(prefix);
    return hit ? hit[0].toLowerCase() : hit;
  }

  var first = beforePath(url1);
  var second = beforePath(url2);
  return first === second;
}

// Well-behaved browsers (Opera, maybe WebKit) could use this simple function:
// function html2dom( html, cb/*( xml, url, xhr )*/, url, xhr ) {
//   cb( (new DOMParser).parseFromString(html, "text/html"), url, xhr );
// }

// Firefox doesn't implement (new DOMParser).parseFromString(html, "text/html")
// (https://bugzilla.mozilla.org/show_bug.cgi?id=102699), so we need this hack:
// html2dom( html, cb, url, xhr, runGM ) renders a page in a hidden iframe and
// calls cb( doc, url, xhr ) with the resulting document.
//   html  - markup to render (unused on a cache hit, or when the iframe is
//           pointed at url itself)
//   cb    - callback, always invoked as cb( doc, url, xhr )
//   url   - where the markup came from; also the cache key
//   xhr   - request object passed through to cb untouched (may be null)
//   runGM - if true and mayCommunicate(url, location.href) holds, the iframe
//           loads url directly ("through GM") instead of being written into
// The cache lives on the function itself: html2dom[url] is { onload:[cb, ...],
// xhr } while the document loads and { doc, xhr } once it has. The return
// value is incidental and not meant to be used.
function html2dom( html, cb/*( xml, url, xhr )*/, url, xhr, runGM ) {
  function loaded() {
    var callbacks = cached.onload;
    // Nothing (left) to notify: this load was already handled by the other
    // event, or html2dom.destroy() has cleared the entry in the meantime.
    if (!callbacks)
      return;
    doc = cached.doc = iframe.contentDocument;
    iframe.removeEventListener("load", loaded, false);
    doc.removeEventListener("DOMContentLoaded", loaded, false);
    delete cached.onload; // from here on the entry counts as fully loaded
    //console.log("DOMContentLoaded of %x: cb %x", url, callbacks);
    setTimeout(function() { // avoid racing with GM's DOMContentLoaded callback
      //console.log("Running %x callbacks", url);
      callbacks.forEach(function(cb) { cb( doc, url, xhr ); });
    }, 10);
  }

  var cached = html2dom[url]; // cache of all already loaded and rendered DOM:s
  if (cached) {
    if (cached.onload) // still loading: queue up behind the first caller
      return cached.onload.push(cb);
    // Already loaded. Same argument order as everywhere else: (doc, url, xhr).
    return cb(cached.doc, url, cached.xhr);
  }

  var iframe = document.createElement("iframe");
  iframe.style.height = iframe.style.width = "0";
  iframe.style.visibility = "hidden";
  iframe.style.position = "absolute";
  document.body.appendChild(iframe);

  iframe.addEventListener("load", loaded, false);
  html2dom[url] = cached = { onload:[cb], xhr:xhr };
  if (runGM && mayCommunicate(url, location.href))
    return iframe.src = url; // load through GM (should be cached due to xhr)

  //console.log("May not communicate / GM scripts unwanted! (%x)", runGM);
  html = html.replace(/[\n\r]+/g, " "). // needed to not freeze up(?!)
    replace(/<script.*?<\/script>/ig, ""). // no code execution on injection!
    replace(/<body(\s+[^="']*=("[^"]*"|'[^']*'|[^'"\s]\S*))*\s*onload=("[^"]*"|'[^']*'|[^"']\S*)/ig, "<body$1" );
  iframe.contentWindow.location.href = location.href; // for cross domain issues
  var doc = iframe.contentDocument;
  doc.open("text/html");
  doc.addEventListener("DOMContentLoaded", loaded, false);
  doc.write(html); // this may throw weird errors we can't catch or silence :-|
  doc.close();
}

// Drops every cache entry stored on html2dom and clears the references each
// entry holds (document, pending callbacks, xhr).
html2dom.destroy = function() {
  for (var url in html2dom) {
    if (!html2dom.hasOwnProperty(url))
      continue;
    var cache = html2dom[url];
    // for...in also visits html2dom's own non-cache properties, notably this
    // very method; only object-valued entries are cache records.
    if (!cache || typeof cache != "object")
      continue;
    cache.doc = cache.onload = cache.xhr = null;
    delete html2dom[url];
  }
};

// functionally belongs to html2dom above (see location.href line for details)
try { // don't run this script recursively on wget() documents on other urls
  if (window.frameElement &&
      window.parent.location.href.replace(/#.*/, "") == location.href)
    return; // console.warn("Avoiding double firing on %x", location.href);
} catch(e) {
  //console.error("Double fire check error: %x", e);
}

// Clear html2dom's document cache when this page is unloaded.
window.addEventListener("unload", html2dom.destroy, false);


// State shared by init(), maybeFetch() and inject():
//   mainIndexPath - XPath from the "pagination-container" meta tag (optional)
//   mainItemsPath - XPath from the "items-xpath" meta tag
//   next          - url of the next page to fetch, if any
//   last          - last item node currently matched on this page
//   seen          - urls that have already been requested, as { url: true }
var mainIndexPath, mainItemsPath, next, last, seen = {};

// init() returns true only once it has started listening; anything else
// means the page was not (yet) usable, so try once more on the load event.
if (!init()) // if microformat producers have not run yet, retry on load
  addEventListener("load", init, false);

// Reads the pagination microformat of the current page into the shared state
// and, when there is both a next page and at least one item, starts listening
// for scroll events. Returns true in that case and undefined otherwise.
function init() {
  mainIndexPath = getMetainfo("meta", "pagination-container"); // optional
  mainItemsPath = getMetainfo("meta", "items-xpath");

  next = getMetainfo("link", "next");
  if (!next)
    return; // console.info("Unpaginator: encountered last page.");

  if (!mainItemsPath)
    return; // console.warn("Unpaginator found no items-xpath meta tag: aborting.");

  last = getLastItem(mainItemsPath);
  if (!last) {
    console.log("Unpaginator found no items matching %x", mainItemsPath);
    return;
  }

  listen();
  return true;
}

// Starts reacting to scrolling, and checks right away whether the next page
// should be fetched already.
function listen() {
  var page = document;
  page.addEventListener("scroll", maybeFetch, false);
  //console.info("on");
  maybeFetch();
}

// Stops reacting to scrolling; the counterpart of listen().
function deafen() {
  var page = document;
  page.removeEventListener("scroll", maybeFetch, false);
  //console.warn("off");
}

// Last node matching `xpath' in the current page. A falsy xpath is handed
// back unchanged without evaluating anything.
function getLastItem(xpath) {
  if (!xpath)
    return xpath;
  var lastOnly = "("+ xpath +")[last()]";
  return $X(lastOnly);
}

// Scroll handler: once the last known item is within one and a half viewport
// heights of the top of the viewport, stop listening and request the next
// page. inject() turns listening back on when the page has arrived.
function maybeFetch() {
  //console.info("maybeFetch()");
  if (last) {
    var preloadLine = pageYOffset + innerHeight * 1.5;
    if (coordsOf(last).y > preloadLine)
      return; // not preload time yet
  }

  deafen(); // only one xhr at a time

  if (!next) { // may happen in the cross-domain case
    // Fall back on a link located through the "next-xpath" meta tag.
    var xpath = getMetainfo("meta", "next-xpath");
    var a = xpath ? $X(xpath) : xpath;
    next = a ? a.href : a;
  }

  if (!next || seen[next])
    return; // nowhere (new) to go; the listener stays unregistered

  // fetch (and reregister the unregistered listener)
  seen[next] = true;
  wget(next, inject, true); // true: should try to run GM scripts on it first
}

// Returns { x, y }: the offsetLeft/offsetTop of `node' summed up along its
// offsetParent chain. A node without an offsetLeft of its own is measured via
// its nearest ancestor that has one.
function coordsOf(node) {
  // Climb to the closest ancestor that carries layout offsets (stopping at a
  // node with no parent, whichever comes first).
  while (typeof node.offsetLeft == "undefined" && node.parentNode)
    node = node.parentNode;

  var left = 0, top = 0;
  for (var box = node; box; box = box.offsetParent) {
    left += box.offsetLeft;
    top += box.offsetTop;
  }
  return { x:left, y:top };
}

// wget() callback: merges the freshly loaded page `doc' into the current one,
// updates the shared `last' and `next' state and resumes listening. The url
// and xhr arguments are accepted but not used.
function inject(doc, url, xhr) {
  var indexParent, freshIndex;

  // If there is a pagination container, rewrite current one from the next page
  var oldIndex = $X(mainIndexPath);
  var indexPath = getMetainfo("meta", "pagination-container", doc) ||
    mainIndexPath; // in the cross-domain case, we may have to guess.
  var remoteIndex = indexPath && oldIndex && $X(indexPath, doc);
  if (remoteIndex) {
    indexParent = oldIndex.parentNode;
    freshIndex = document.importNode(remoteIndex, true);
    indexParent.replaceChild(freshIndex, oldIndex);
  }

  // append the injected nodes at the end of this page (discounting the index)
  var itemsPath = getMetainfo("meta", "items-xpath", doc) || mainItemsPath;
  var items = $x(itemsPath, doc);
  var container = last.parentNode;
  // When the index sits in the same parent as the items, new items go in
  // front of it; otherwise they are appended at the very end.
  var before = indexParent == container ? freshIndex : null;
  appendTo(items, container, before);

  // getLastItem() only looks at its first argument and evaluates against the
  // current page, which by now holds the imported items.
  last = getLastItem(mainItemsPath, doc);
  next = getMetainfo("link", "next", doc);
  listen();
}

// Imports a deep copy of every node in `nodes' (an array or array-like) into
// target's document and inserts the copies, in order, as children of `target'
// in front of `notafter' -- or at the end when `notafter' is null. The list
// passed in is left untouched.
function appendTo(nodes, target, notafter) {
  var pending = [].slice.call(nodes);
  var owner = target.ownerDocument;
  for (var i = 0; i < pending.length; i++) {
    var copy = owner.importNode(pending[i], true);
    target.insertBefore(copy, notafter);
  }
}