YAAS:YY的网页爬虫工具箱

By Dexter.Yy — Last updated Sep 26, 2009 — Installed 112 times.

There are 5 previous versions of this script.

// ==UserScript==
// @name           YAAS
// @namespace      limboy.com
// @description    Yet Another All-in-one Spider
// @include        *
// ==/UserScript==


(function(){
	// Only run in the top-level document: when this window is not `top`
	// (i.e. the page is loaded inside a frame), leave without doing anything.
	if( window != top )
		return false;
		
	/**
	 * TUI: a small helper toolkit used by the rest of this userscript.
	 * It bundles a type check, an iterator, DOM construction helpers and a
	 * thin XMLHttpRequest wrapper.
	 */
	var TUI = {
		/**
		 * Tell whether a value is a function.
		 * @param {*} obj value to inspect
		 * @returns {boolean}
		 */
		isFunction: function(obj) {
			return Object.prototype.toString.call(obj) === "[object Function]";
		},
		/**
		 * Iterate over an array or over the enumerable properties of an object.
		 * Returning false from the callback stops the iteration.
		 *
		 * Arrays:  fn is called with `this` = element and ONE argument, the
		 *          1-based position of that element.
		 * Objects: fn is called with `this` = value and arguments (value, key).
		 *
		 * @param {Array|Object} obj collection to walk
		 * @param {Function} fn callback, see above for its arguments
		 */
		each: function(obj, fn){
			var i, re;
			if( obj.constructor == Array ) {
				for(i=0; i<obj.length; i++){
					re = fn.call(obj[i], i+1);
					if( re === false )
						break;
				}
			} else {
				for(i in obj){
					re = fn.call(obj[i], obj[i], i);
					if( re === false )
						break;
				}
			}
		},
		/**
		 * Shortcut for document.getElementById.
		 * @param {string} did element id
		 * @returns {Element|null}
		 */
		elm: function(did) {
			return document.getElementById(did);
		},
		/**
		 * Tell whether a tag is one of the elements written without a closing
		 * tag (img, br, input, ...).
		 * @param {string} tagName tag name, any case
		 * @returns {number|undefined} 1 for such a tag, otherwise a falsy value
		 */
		isCloseTag: function(tagName){
			return ({area:1,base:1,col:1,hr:1,img:1,br:1,input:1,link:1,meta:1,param:1})[tagName.toLowerCase()];
		},
		/**
		 * Build a DOM element, either from raw markup or from a tag name plus
		 * an attribute map.
		 *
		 *   addElm('<div>..</div>', cb)            markup form
		 *   addElm('a', {href: '/x', className: 'c', innerHTML: 'text'}, cb)
		 *
		 * In the attribute map, `className` becomes the `class` attribute and
		 * `innerHTML` becomes the element content. Attribute values are
		 * inserted verbatim, so they must not contain a double quote.
		 *
		 * @param {string} tag markup (anything containing "<") or a tag name
		 * @param {Object|Function} [attr] attribute map, or the callback in markup form
		 * @param {Function} [cb] called with `this` set to the new element
		 * @returns {Node} the first node produced by the markup
		 */
		addElm: function(tag, attr, cb) {
			var temp = document.createElement("div");
			if( /</.test(tag) ){
				// Markup form: the second argument is the callback.
				temp.innerHTML = tag;
				cb = attr;
			}else{
				// The leading empty entry yields the space after the tag name.
				var attrhtml = [''];
				attr = attr || {};
				// TUI.each hands object entries over as (value, name).
				TUI.each(attr, function(value, name){
					if( name == "innerHTML" )
						return true;
					attrhtml.push( ( name == "className" ? 'class' : name ) + '="' + value + '"' );
				});
				temp.innerHTML = '<' + tag + attrhtml.join(' ')
								+ ( TUI.isCloseTag(tag) ? ' />' : ( '>' + ( attr.innerHTML || '' ) + '</' + tag + '>' ) );
			}
			var elm = temp.firstChild;
			if(cb)
				cb.call(elm);
			return elm;
		},
		/**
		 * Append a <style> element holding the given CSS text to the page.
		 * @param {string} css stylesheet source
		 */
		insertStyle: function(css) {
			var styleElm = document.createElement("style");
			// Some documents have no <head>; fall back to the root element.
			var host = document.getElementsByTagName("head")[0] || document.documentElement;
			host.appendChild(styleElm);
			styleElm.appendChild(document.createTextNode(css));
		},
		/**
		 * Issue an asynchronous XMLHttpRequest.
		 *
		 * @param {Object} op
		 * @param {string} op.url target address
		 * @param {string} [op.type] HTTP method, "GET" when omitted
		 * @param {string} [op.data] already-encoded parameters; appended to the
		 *        query string for GET, sent as a form-encoded body otherwise
		 * @param {Function} [op.success] called with the response text once the
		 *        request has finished (whatever the HTTP status)
		 * @param {string} [op.username]
		 * @param {string} [op.password]
		 * @returns {XMLHttpRequest}
		 */
		ajax: function(op) {
			var method = op.type || 'GET';
			var url = op.url;
			var body = op.data || '';
			if( body && String(method).toUpperCase() == 'GET' ) {
				// A GET request carries no body, so parameters go into the URL.
				url += ( /\?/.test(url) ? '&' : '?' ) + body;
				body = '';
			}
			var XHR = new XMLHttpRequest();
			XHR.open( method, url, true, op.username, op.password );
			if( body && typeof body == "string" )
				XHR.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
			XHR.onreadystatechange = function() {
				if( XHR.readyState == 4 && op.success )
					op.success(XHR.responseText);
			};
			XHR.send(body);
			return XHR;
		},
		/**
		 * Serialize parameters into an application/x-www-form-urlencoded string.
		 *
		 * @param {Array|Object} a either an array of {name, value} records or a
		 *        plain key/value object; null/undefined yields ""
		 * @returns {string} e.g. "a=1&b=x+y"
		 */
		httpParam: function(a) {
			var s = [];
			if( a == null )
				return '';
			var add = function(name, value) {
				s.push( encodeURIComponent( name ) + "=" + encodeURIComponent( value ) );
			};
			if ( a.constructor == Array ) {
				for ( var i = 0; i < a.length; i++ )
					add( a[i].name, a[i].value );
			} else {
				for ( var j in a )
					if ( Object.prototype.hasOwnProperty.call(a, j) )
						add( j, a[j] );
			}
			return s.join("&").replace(/%20/g, "+");
		},
		/**
		 * Common entry point behind get() and post().
		 *
		 * @param {string} type HTTP method
		 * @param {string} url target address
		 * @param {string|Object|Array|Function} [data] parameters; may be
		 *        omitted, in which case the callback can take its place
		 * @param {Function} [cb] called with the response text
		 * @param {string} [dataType] forwarded to ajax(), which currently ignores it
		 * @returns {XMLHttpRequest}
		 */
		request: function(type, url, data, cb, dataType) {
			if( this.isFunction(data) ) {
				cb = data;
				data = '';
			} else if( data == null ) {
				data = '';
			} else if( typeof data != "string" ) {
				data = this.httpParam(data);
			}

			return this.ajax({
				url: url,
				type: type,
				data: data,
				success: cb,
				dataType: dataType
			});
		},
		/** GET shortcut for request(); same arguments minus the method. */
		get: function(url, data, cb, type) {
			return this.request("GET", url, data, cb, type);
		},
		/** POST shortcut for request(); same arguments minus the method. */
		post: function(url, data, cb, type) {
			return this.request("POST", url, data, cb, type);
		}
	};
	
	/**
	 * Create a crawler that collects links matching a pattern, starting from
	 * the current page and following every matching link it finds.
	 *
	 * @param {Object} [op]
	 * @param {Function} [op.loading] called without arguments each time a page
	 *        request has been issued
	 * @param {Function} [op.error] called with the exception when a crawl step
	 *        fails; the crawl is paused first
	 * @returns {Object} an object exposing getSimilarLink(); pause() and go()
	 *          are attached to it by each getSimilarLink() call
	 */
	TUI.pageSpider = function(op){
		op = op || {};
		var loading = op.loading || function(){};
		var error = op.error || function(){};

		return {
			/**
			 * Start a crawl from document.body.
			 *
			 * @param {string} rule regular-expression source matching the links
			 *        to collect and follow
			 * @param {Function} callback called with the array of collected
			 *        links once no request is outstanding any more
			 * @returns {boolean} false when the start page contains no match,
			 *          true otherwise (including when an error was reported)
			 */
			getSimilarLink: function(rule, callback){
				var me = this;
				var paused = false;
				// Requests are keyed by a counter rather than a timestamp: several
				// links are queued within the same millisecond, and colliding keys
				// would leave requestPool.length stuck above zero forever.
				var lastId = 0;
				// Responses to requests numbered <= staleBefore were superseded by go().
				var staleBefore = 0;
				// link -> true (queued or crawled) / false (fetch discarded while paused)
				var history = { length: 0 };
				// request id -> link, for requests whose page has not been processed yet
				var requestPool = { length: 0 };

				// Collect every key of the map except its "length" counter.
				var getList = function(data){
					var links = [];
					TUI.each(data, function(v, n){
						if( n != "length" )
							links.push(n);
					});
					return links;
				};

				// Remove a request from the pool, at most once.
				var release = function(id){
					if( Object.prototype.hasOwnProperty.call(requestPool, id) ) {
						delete requestPool[id];
						requestPool.length--;
					}
				};

				// Report the result once something was found and nothing is pending.
				var finishIfDone = function(){
					if( history.length != 0 && requestPool.length <= 0 )
						callback( getList(history) );
				};

				// Queue one link for fetching unless it is already known.
				var visit = function(link){
					if( history[link] === true )
						return;
					var id = ++lastId;
					history[link] = true;
					history.length++;
					requestPool[id] = link;
					requestPool.length++;
					try{
						TUI.get(link, function(html){
							if( id <= staleBefore )
								return false;
							if( paused ) {
								// Keep the pool entry so go() requests the page again.
								history[link] = false;
								history.length--;
								return;
							}
							release(id);
							spider(html);
						});
					}catch(e){
						// Best effort: a link that cannot even be requested is skipped,
						// but its slot is freed so the crawl can still complete.
						release(id);
						return;
					}
					try{
						loading();
					}catch(e){
						// The progress hook is cosmetic; its failure must not stop the crawl.
					}
				};

				// Process one fetched page (string) or a list of links (array).
				var spider = function(html) {
					try{
						var found = html;
						if( typeof html == "string" )
							found = html.replace(/&amp;/g, "&").match(new RegExp(rule, "g"));
						if( !found ) {
							// A page without matches may have been the last one pending.
							finishIfDone();
							return false;
						}
						TUI.each(found, function(){
							visit( String(this) );
						});
						// Checked once per page, after all of its links were queued.
						finishIfDone();
					}catch(e){
						me.pause();
						error(e);
					}
					return true;
				};

				/**
				 * Suspend the crawl; responses arriving meanwhile are discarded.
				 * @param {Function} [fn] called with the links collected so far
				 */
				this.pause = function(fn){
					paused = true;
					if(fn)
						fn( getList(history) );
				};

				/**
				 * Resume a paused crawl by requesting every pending page again.
				 */
				this.go = function(){
					var links = [];
					TUI.each(requestPool, function(link, id){
						if( id == "length" )
							return;
						links.push(link);
						// A page still in flight is marked as visited; its response is
						// about to be declared stale, so forget it to get it re-queued.
						if( history[link] === true ) {
							history[link] = false;
							history.length--;
						}
					});
					staleBefore = lastId;
					requestPool = { length: 0 };
					paused = false;
					spider(links);
				};

				return spider(document.body.innerHTML);
			}
		}

	};
	
	/**
	 * YYbox: the floating control panel of the crawler (markup, styling,
	 * showing and closing).
	 */
	var YYbox = {
		/**
		 * Markup of the panel.
		 * Built as a plain string: the E4X XML literal used previously is not
		 * valid JavaScript in current engines. The string starts directly with
		 * the root tag because the caller takes the first child node of the
		 * parsed markup.
		 * @returns {string}
		 */
		getHTML: function(){
			return [
				'<div id="TUI_box">',
					'<h4><strong style="color:#ff6600">YAAS</strong> - YY的网页爬虫<span><input type="button" id="TUI_boxClose" class="TUI_btn" value="X" /></span></h4>',
					'<div class="TUI_method">',
						'<h5>#1: 抓取当前网站中相似的链接地址</h5>',
						'<p><label for="TUI_linkrule">链接地址:</label><input type="text" value="" id="TUI_linkrule"  /></p>',
						'<p><em>tips: 可变部分用*代替</em><input type="button" class="TUI_btn" id="TUI_getSimilarLink" value="开始抓取" /></p>',
					'</div>',
					'<div id="TUI_result">Loading, please waiting</div>',
				'</div>'
			].join('');
		},
		/**
		 * Inject the panel stylesheet into the page.
		 * The standard border-radius properties are listed next to the legacy
		 * -moz- ones, which current browsers no longer honour.
		 */
		addCSS: function() {
			var style = '#TUI_box{text-align:center;font-size:14px;padding:0;width:360px;overflow:hidden;background:#fff;border:2px solid #999;position:fixed;top:10px;left:10px;z-index:19999;-moz-border-radius-topleft:8px;-moz-border-radius-topright:8px;border-top-left-radius:8px;border-top-right-radius:8px;}'
						+ '#TUI_box .TUI_btn{padding:0;width:22px;font-size:14px;line-height:14px;background:none;color:#fff;border:0;cursor:pointer}'
						+ '#TUI_box .TUI_btn:hover{background:#000;color:#fff;}'
						+ '#TUI_box h4{text-align:left;font-weight:500;padding:0 0 0 20px;margin:0;font-size:16px;height:40px;line-height:40px;background:#333;color:#fff;-moz-border-radius-topleft:5px;-moz-border-radius-topright:5px;border-top-left-radius:5px;border-top-right-radius:5px;}'
						+ '#TUI_box h4 span{float:right;margin:-40px 0 0 0;}'
						+ '#TUI_box h4 span .TUI_btn{float:right;margin:10px 8px 0 0;}'
						+ '#TUI_box .TUI_method{margin:10px 20px;text-align:center;}'
						+ '#TUI_box\tp{clear:both;line-height:30px;padding:0;margin:0;text-align:left;}'
						+ '#TUI_box\tp em{line-height:20px;font-size:12px;float:left;margin:10px 0 10px 0;color:#666}'
						+ '#TUI_box .TUI_method p label{width:80px;height:30px;display:block;float:left;}'
						+ '#TUI_box .TUI_method p input{font-size:14px;width:228px;padding:3px;}'
						+ '#TUI_box .TUI_method h5{font-size:12px;padding:4px 6px;margin:10px 0;background:#eee;}'
						+ '#TUI_box .TUI_method .TUI_btn{background:#333;width:100px;margin:10px 0 10px 5px;float:right;}'
						+ '#TUI_box .TUI_method .TUI_btn:hover{background:#000}'
						+ '#TUI_result{display:none;clear:both;background:#ffffcc;padding:10px;background:#333;color:#fff;text-align:left;}'
						+ '#TUI_result textarea{border:1px solid #666;width:320px;background:#ffffcc;height:200px;padding:5px;}';
			TUI.insertStyle(style);
		},
		/**
		 * Add the stylesheet and the panel to the page and wire the close button.
		 * @param {Function} init called with `this` set to the panel element
		 *        once the panel is in the document
		 */
		show: function(init) {
			var that = this;
			this.addCSS();
			this.box = TUI.addElm(this.getHTML(), function(){ document.body.appendChild(this); });
			TUI.elm("TUI_boxClose").addEventListener('click', function(){
				that.close();
			}, false);
			init.call(this.box);
		},
		/**
		 * Remove the panel from the page; does nothing when it is not attached.
		 */
		close: function(){
			var box = this.box;
			if( box && box.parentNode )
				box.parentNode.removeChild(box);
		}
	};
	
	// The crawler instance driving the panel: progress is shown as a growing
	// row of dots in the result area, and a failure replaces that area with a
	// hint and turns the action button into a "retry" button.
	var spider = (function(){
		function showProgress(){
			var panel = TUI.elm("TUI_result");
			panel.innerHTML = panel.innerHTML + " .";
		}

		function showFailure(e){
			var panel = TUI.elm("TUI_result");
			panel.innerHTML = '<span style="color:#f00">抓取过程出错,注意匹配规则应该跟页面里的链接地址写法保持一致,尽可能写相对地址("/"开头)</span>';
			var button = TUI.elm("TUI_getSimilarLink");
			button.value = "重新抓取";
		}

		return TUI.pageSpider({
			loading: showProgress,
			error: showFailure
		});
	})();
	
	// Show the panel and turn its action button into a three-state control:
	//   "暂停"      -> pause the crawl   (clickBtn(1)), label becomes "继续抓取"
	//   "继续抓取"  -> resume the crawl  (clickBtn(2)), label becomes "暂停"
	//   any other   -> start a new crawl (clickBtn(0)), label becomes "暂停"
	YYbox.show(function(){
		var actionButton = TUI.elm("TUI_getSimilarLink");
		actionButton.addEventListener('click', function(){
			var label = this.value;
			var action = 0;
			var nextLabel = "暂停";
			if( label == "暂停" ) {
				action = 1;
				nextLabel = "继续抓取";
			} else if( label == "继续抓取" ) {
				action = 2;
			}
			// The label is switched before the action runs, because the action
			// itself may set the label again.
			this.value = nextLabel;
			clickBtn(action);
		}, false);
	});
	
	/**
	 * Display the collected links, one per line, in a textarea inside the
	 * result area.
	 *
	 * The links are assigned through the textarea's `value` instead of being
	 * concatenated into the markup: they come from crawled pages, and as HTML
	 * a sequence such as "&copy=" would be decoded as an entity and a "<"
	 * could terminate the textarea.
	 *
	 * @param {string[]} links addresses to display
	 */
	function showResult(links) {
		var r = TUI.elm("TUI_result");
		r.innerHTML = '<p>结果:</p><textarea></textarea>';
		var text = r.getElementsByTagName("textarea")[0];
		text.value = links.join("\n");
	}
	
	/**
	 * Carry out one of the panel actions.
	 *
	 * @param {number} action
	 *        0 - start a new crawl using the pattern typed into the rule field
	 *        1 - pause the running crawl and show the links found so far
	 *        2 - resume a paused crawl
	 */
	function clickBtn(action) {
		var resultBox = TUI.elm("TUI_result");
		if(action == 0) {
			var rule = TUI.elm("TUI_linkrule").value;
			resultBox.innerHTML = "Loading, please waiting";
			resultBox.style.display = "block";
			// An empty pattern would compile to a regex matching the empty string
			// at every position, so it is handled like a pattern without results.
			var found = false;
			if( /\S/.test(rule) ) {
				// Escape regex metacharacters, then turn each "*" wildcard into
				// "one or more characters that cannot be part of a link segment".
				rule = rule.replace(/[\^\$\.\+\?\=\!\:\|\\\/\(\)\[\]\{\}]/g, "\\$&").replace(/\*/g, "[^&\\/\"'<>]+");
				found = spider.getSimilarLink(rule, function(data){
					showResult(data);
					TUI.elm("TUI_getSimilarLink").value = "重新抓取";
				});
			}
			if (!found) {
				resultBox.innerHTML = '<span style="color:#fff">无结果,试试改一下匹配规则,尽可能用相对地址("/"开头)</span>';
				TUI.elm("TUI_getSimilarLink").value = "重新抓取";
			}
		} else if(action == 1) {
			spider.pause(function(data){
				showResult(data);
			});
		} else if(action == 2) {
			resultBox.innerHTML = "Loading, please waiting";
			spider.go();
		}
	}
	
})();