15 points

Parsing Response HTML

Last update Sep 24, 2009

If you are parsing XML, using the DOMParser is very easy and straightforward.

// Fetch `url` and parse the response body as XML with a DOMParser.
// NOTE: `url` and `responseXML` are expected to be declared by the surrounding script.
GM_xmlhttpRequest({
    method: 'GET',
    url: url,
    onload: function (responseDetails) {
        // Obtain the DOMParser constructor through an XPCNativeWrapper around `window`.
        var dp = new XPCNativeWrapper(window, "DOMParser()");
        var parser = new dp.DOMParser();
        // The response text lives on the object passed to onload, not on `self`.
        responseXML = parser.parseFromString(responseDetails.responseText, 'text/xml');
    }
});

However, this code only works because XML is strictly formatted. If you want to parse an HTML page so that you can use XPath on it, using this code will likely give you a "not well-formed" XML parsing error. Luckily, I have found a way around this error: create a new document and fill it with the response HTML.

/**
 * Fetches `url` with GM_xmlhttpRequest and rebuilds the response HTML as a
 * new document containing html > head + body, so it can be queried with the
 * DOM API or XPath.
 *
 * The contents of the first <head>...</head> and <body>...</body> pairs in
 * the response text are copied into the new elements. When a pair is absent,
 * the corresponding element is left empty instead of throwing.
 *
 * @param {string} url - Address to request with the GET method.
 * @param {function(Document)} callback - Called with the constructed
 *     document once the response has loaded.
 */
function getDOC(url, callback) {
    // [\s\S]*? matches any characters (newlines included) and also an empty
    // element; \b keeps <header> from being mistaken for <head>.
    var headPattern = /<\s*head\b[^>]*>([\s\S]*?)<\s*\/head\s*>/i,
        bodyPattern = /<\s*body\b[^>]*>([\s\S]*?)<\s*\/body\s*>/i;

    GM_xmlhttpRequest({
        method: 'GET',
        url: url,
        onload: function (responseDetails) {
            var text = responseDetails.responseText,
                headMatch = headPattern.exec(text),
                bodyMatch = bodyPattern.exec(text),
                doc = document.implementation.createDocument('', '', null),
                html = document.createElement('html'),
                head = document.createElement('head'),
                body = document.createElement('body');
            // NOTE(review): the fetched markup is assigned through innerHTML on
            // elements created by the current page's document; only use this
            // with URLs whose content you trust.
            head.innerHTML = headMatch ? headMatch[1] : '';
            body.innerHTML = bodyMatch ? bodyMatch[1] : '';
            doc.appendChild(html);
            html.appendChild(head);
            html.appendChild(body);
            callback(doc);
        }
    });
}

// Example: fetch a page and show the markup of the rebuilt document.
getDOC('http://example.com/', function (fetchedDoc) {
    var markup = fetchedDoc.documentElement.innerHTML;
    alert(markup);
});

Once you do this, you can use evaluate and getElementsByTagName on doc:

// Example: count the <a> elements of the fetched page, first with XPath and
// then with the DOM API.
getDOC('http://example.com/', function (doc) {
    // XPathResult.NUMBER_TYPE requests a numeric result, read via numberValue.
    var linkCount = doc.evaluate('count(.//a)', doc, null, XPathResult.NUMBER_TYPE, null).numberValue;
    alert(linkCount);
    alert(doc.getElementsByTagName('a').length);
});