tampermonkey,采用js解析自定义脚本,实现网页列表数据采集分析

最近一直在做数据采集的事情,目的是使用java开发一套采集引擎:解析指定的采集规则,模拟用户动作做数据提取。
因此定义了一套动作脚本,open,click,get,list,opentab,closetab。。。
java解析脚本,调用phantomjs做数据提取,生成数据json文件,对外提供数据接口。
采集引擎终于写的差不多了,虽然还有很多问题需要修改,但是终于不用加班了,嘿嘿嘿。-------jstarseven

码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html
言归正传,由于一直搞这些东西,突然想着拿js去写个采集玩一玩,就用tampermonkey,毕竟好久没玩了。

简介:针对一些网站的数据列表,定义采集脚本,模拟用户操作,做列表数据提取,生成json数据格式化展示。

json采集脚本定义:

 1 {
 2     "type": "list",
 3     "selector": "",//列表选择器
 4     "max_page": 1,//采集页数
 5     "page_selector": "",//翻页选择器
 6     "iframe_selector": "",//iframe 选择器
 7     "datas": [//采集字段定义
 8         {
 9             "selector": " ",//字段选择器<此处为针对列表的子选择器>
10             "column": "title",//字段名称
11             "from": "text",//采集类型
12             "iframe_selector": "",//iframe选择器 防止一些网站怪异 一般不需要
13             "open_tab": [//当前字段开新标签做采集
14                 {
15                     "selector": " ",//新标签字段选择器
16                     "column": " ",
17                     "from": "text",
18                     "iframe_selector": ""
19                 },
20                 {
21                     "selector": " ",
22                     "column": " ",
23                     "from": "text",
24                     "iframe_selector": ""
25                 },
26                 {
27                     "selector": " ",
28                     "column": " ",
29                     "from": "text",
30                     "iframe_selector": ""
31                 }
32             ]
33         },
34         {
35             "selector": " ",//字段选择器
36             "column": " ",
37             "from": "text",
38             "iframe_selector": ""
39         },
40         {
41             "selector": " ",//字段选择器
42             "column": " ",
43             "from": "text",
44             "iframe_selector": ""
45         }
46     ]
47 }

 

脚本定义好了,剩下的就是写js代码解析脚本,做数据采集,数据合并了。
那么怎么去解析实现呢,针对新开标签页的数据采集,怎么样要和之前的列表项数据做合并,保证数据的完整性呢?
1.采集到的数据需要先存储起来。首先想到的是sessionStorage,但是sessionStorage的数据在新开的标签页之间不能共享,
所以改用localStorage。localStorage的容量上限一般在5MB左右,足以存储一般列表的十几页数据。
2.详情页面的数据需要和列表项数据合并:既然上面用到了localStorage,那么就在localStorage里面放入一个指定的map,存放列表数据;
为列表的每一项生成一个key,在新开标签的时候传递该key;详情页提取到数据后,再把详情数据放入map中该key对应的数据里。

js实现map方便数据存储:

 

  1 /*   
  2  * MAP对象,实现MAP功能   
  3  *   
  4  * 接口:   
  5  * size()     获取MAP元素个数   
  6  * isEmpty()    判断MAP是否为空   
  7  * clear()     删除MAP所有元素   
  8  * put(key, value)   向MAP中增加元素(key, value)    
  9  * remove(key)    删除指定KEY的元素,成功返回True,失败返回False   
 10  * get(key)    获取指定KEY的元素值VALUE,失败返回NULL   
 11  * element(index)   获取指定索引的元素(使用element.key,element.value获取KEY和VALUE),失败返回NULL   
 12  * containsKey(key)  判断MAP中是否含有指定KEY的元素   
 13  * containsValue(value) 判断MAP中是否含有指定VALUE的元素   
 14  * values()    获取MAP中所有VALUE的数组(ARRAY)   
 15  * keys()     获取MAP中所有KEY的数组(ARRAY)   
 16  */
 17 function Map() {
 18     this.elements = [];
 19 
 20     //获取MAP元素个数     
 21     this.size = function () {
 22         return this.elements.length;
 23     };
 24 
 25     //判断MAP是否为空     
 26     this.isEmpty = function () {
 27         return (this.elements.length < 1);
 28     };
 29 
 30     //删除MAP所有元素     
 31     this.clear = function () {
 32         this.elements = [];
 33     };
 34 
 35     //向MAP中增加元素(key, value)      
 36     this.put = function (_key, _value) {
 37         for (var i = 0; i < this.elements.length; i++) {
 38             if (this.elements[i].key == _key) {
 39                 this.elements[i].value = _value;
 40                 return;
 41             }
 42         }
 43         this.elements.push({
 44             key: _key,
 45             value: _value
 46         });
 47     };
 48 
 49     //删除指定KEY的元素,成功返回True,失败返回False     
 50     this.remove = function (_key) {
 51         var bln = false;
 52         try {
 53             for (var i = 0; i < this.elements.length; i++) {
 54                 if (this.elements[i].key == _key) {
 55                     this.elements.splice(i, 1);
 56                     return true;
 57                 }
 58             }
 59         } catch (e) {
 60             bln = false;
 61         }
 62         return bln;
 63     };
 64 
 65     //获取指定KEY的元素值VALUE,失败返回NULL     
 66     this.get = function (_key) {
 67         try {
 68             for (var i = 0; i < this.elements.length; i++) {
 69                 if (this.elements[i].key == _key) {
 70                     return this.elements[i].value;
 71                 }
 72             }
 73         } catch (e) {
 74             return null;
 75         }
 76     };
 77 
 78     //获取指定索引的元素(使用element.key,element.value获取KEY和VALUE),失败返回NULL     
 79     this.element = function (_index) {
 80         if (_index < 0 || _index >= this.elements.length) {
 81             return null;
 82         }
 83         return this.elements[_index];
 84     };
 85 
 86     //判断MAP中是否含有指定KEY的元素     
 87     this.containsKey = function (_key) {
 88         var bln = false;
 89         try {
 90             for (var i = 0; i < this.elements.length; i++) {
 91                 if (this.elements[i].key == _key) {
 92                     bln = true;
 93                 }
 94             }
 95         } catch (e) {
 96             bln = false;
 97         }
 98         return bln;
 99     };
100 
101     //判断MAP中是否含有指定VALUE的元素     
102     this.containsValue = function (_value) {
103         var bln = false;
104         try {
105             for (var i = 0; i < this.elements.length; i++) {
106                 if (this.elements[i].value == _value) {
107                     bln = true;
108                 }
109             }
110         } catch (e) {
111             bln = false;
112         }
113         return bln;
114     };
115 
116     //获取MAP中所有VALUE的数组(ARRAY)     
117     this.values = function () {
118         var arr = [];
119         for (var i = 0; i < this.elements.length; i++) {
120             arr.push(this.elements[i].value);
121         }
122         return arr;
123     };
124 
125     //获取MAP中所有KEY的数组(ARRAY)     
126     this.keys = function () {
127         var arr = [];
128         for (var i = 0; i < this.elements.length; i++) {
129             arr.push(this.elements[i].key);
130         }
131         return arr;
132     };
133 }

 

js实现操作localStorage:

 

 1 /**
 2  *获取当前任务配置信息
 3  */
 4 function getTaskDataMap() {
 5     var data_maps = localStorage.getItem("data_maps");
 6     var datas = new Map();
 7     if (isNullParam(data_maps)) {
 8         data_maps = datas;
 9     } else {
10         datas.elements = JSON.parse(data_maps).elements;
11         return datas;
12     }
13     return data_maps;
14 }
15 
/**
 * Reset the persisted task data map by overwriting the "data_maps"
 * localStorage item with an empty string.
 */
function clearTaskDataMap() {
    var emptyValue = "";
    localStorage.setItem("data_maps", emptyValue);
}
22 
/**
 * Store one entry in the persisted task data map and write the map back to
 * localStorage under "data_maps".
 *
 * Nothing is written when isNullParam() rejects either argument
 * (isNullParam is defined outside this snippet).
 *
 * @param key     key identifying the entry
 * @param values  data to store under that key
 */
function addTaskDataMap(key, values) {
    if (!isNullParam(key) && !isNullParam(values)) {
        var taskMap = getTaskDataMap();
        taskMap.put(key, values);
        localStorage.setItem("data_maps", JSON.stringify(taskMap));
    }
}

 

采用jquery.simulate.js实现点击

 

  1 /*!
  2  * jQuery Simulate v@VERSION - simulate browser mouse and keyboard events
  3  * https://github.com/jquery/jquery-simulate
  4  *
  5  * Copyright jQuery Foundation and other contributors
  6  * Released under the MIT license.
  7  * http://jquery.org/license
  8  *
  9  * Date: @DATE
 10  */
 11 
 12 ;(function ($, undefined) {
 13 
 14     var rkeyEvent = /^key/,
 15         rmouseEvent = /^(?:mouse|contextmenu)|click/;
 16 
 17     $.fn.simulate = function (type, options) {
 18         return this.each(function () {
 19             new $.simulate(this, type, options);
 20         });
 21     };
 22 
 23     $.simulate = function (elem, type, options) {
 24         var method = $.camelCase("simulate-" + type);
 25 
 26         this.target = elem;
 27         this.options = options;
 28 
 29         if (this[method]) {
 30             this[method]();
 31         } else {
 32             this.simulateEvent(elem, type, options);
 33         }
 34     };
 35 
 36     $.extend($.simulate, {
 37 
 38         keyCode: {
 39             BACKSPACE: 8,
 40             COMMA: 188,
 41             DELETE: 46,
 42             DOWN: 40,
 43             END: 35,
 44             ENTER: 13,
 45             ESCAPE: 27,
 46             HOME: 36,
 47             LEFT: 37,
 48             NUMPAD_ADD: 107,
 49             NUMPAD_DECIMAL: 110,
 50             NUMPAD_DIVIDE: 111,
 51             NUMPAD_ENTER: 108,
 52             NUMPAD_MULTIPLY: 106,
 53             NUMPAD_SUBTRACT: 109,
 54             PAGE_DOWN: 34,
 55             PAGE_UP: 33,
 56             PERIOD: 190,
 57             RIGHT: 39,
 58             SPACE: 32,
 59             TAB: 9,
 60             UP: 38
 61         },
 62 
 63         buttonCode: {
 64             LEFT: 0,
 65             MIDDLE: 1,
 66             RIGHT: 2
 67         }
 68     });
 69 
 70     $.extend($.simulate.prototype, {
 71 
 72         simulateEvent: function (elem, type, options) {
 73             var event = this.createEvent(type, options);
 74             this.dispatchEvent(elem, type, event, options);
 75         },
 76 
 77         createEvent: function (type, options) {
 78             if (rkeyEvent.test(type)) {
 79                 return this.keyEvent(type, options);
 80             }
 81 
 82             if (rmouseEvent.test(type)) {
 83                 return this.mouseEvent(type, options);
 84             }
 85         },
 86 
 87         mouseEvent: function (type, options) {
 88             var event, eventDoc, doc, body;
 89             options = $.extend({
 90                 bubbles: true,
 91                 cancelable: (type !== "mousemove"),
 92                 view: window,
 93                 detail: 0,
 94                 screenX: 0,
 95                 screenY: 0,
 96                 clientX: 1,
 97                 clientY: 1,
 98                 ctrlKey: false,
 99                 altKey: false,
100                 shiftKey: false,
101                 metaKey: false,
102                 button: 0,
103                 relatedTarget: undefined
104             }, options);
105 
106             if (document.createEvent) {
107                 event = document.createEvent("MouseEvents");
108                 event.initMouseEvent(type, options.bubbles, options.cancelable,
109                     options.view, options.detail,
110                     options.screenX, options.screenY, options.clientX, options.clientY,
111                     options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
112                     options.button, options.relatedTarget || document.body.parentNode);
113 
114                 // IE 9+ creates events with pageX and pageY set to 0.
115                 // Trying to modify the properties throws an error,
116                 // so we define getters to return the correct values.
117                 if (event.pageX === 0 && event.pageY === 0 && Object.defineProperty) {
118                     eventDoc = event.relatedTarget.ownerDocument || document;
119                     doc = eventDoc.documentElement;
120                     body = eventDoc.body;
121 
122                     Object.defineProperty(event, "pageX", {
123                         get: function () {
124                             return options.clientX +
125                                 ( doc && doc.scrollLeft || body && body.scrollLeft || 0 ) -
126                                 ( doc && doc.clientLeft || body && body.clientLeft || 0 );
127                         }
128                     });
129                     Object.defineProperty(event, "pageY", {
130                         get: function () {
131                             return options.clientY +
132                                 ( doc && doc.scrollTop || body && body.scrollTop || 0 ) -
133                                 ( doc && doc.clientTop || body && body.clientTop || 0 );
134                         }
135                     });
136                 }
137             } else if (document.createEventObject) {
138                 event = document.createEventObject();
139                 $.extend(event, options);
140                 // standards event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ff974877(v=vs.85).aspx
141                 // old IE event.button uses constants defined here: http://msdn.microsoft.com/en-us/library/ie/ms533544(v=vs.85).aspx
142                 // so we actually need to map the standard back to oldIE
143                 event.button = {
144                         0: 1,
145                         1: 4,
146                         2: 2
147                     }[event.button] || ( event.button === -1 ? 0 : event.button );
148             }
149 
150             return event;
151         },
152 
153         keyEvent: function (type, options) {
154             var event;
155             options = $.extend({
156                 bubbles: true,
157                 cancelable: true,
158                 view: window,
159                 ctrlKey: false,
160                 altKey: false,
161                 shiftKey: false,
162                 metaKey: false,
163                 keyCode: 0,
164                 charCode: undefined
165             }, options);
166 
167             if (document.createEvent) {
168                 try {
169                     event = document.createEvent("KeyEvents");
170                     event.initKeyEvent(type, options.bubbles, options.cancelable, options.view,
171                         options.ctrlKey, options.altKey, options.shiftKey, options.metaKey,
172                         options.keyCode, options.charCode);
173                     // initKeyEvent throws an exception in WebKit
174                     // see: http://stackoverflow.com/questions/6406784/initkeyevent-keypress-only-works-in-firefox-need-a-cross-browser-solution
175                     // and also https://bugs.webkit.org/show_bug.cgi?id=13368
176                     // fall back to a generic event until we decide to implement initKeyboardEvent
177                 } catch (err) {
178                     event = document.createEvent("Events");
179                     event.initEvent(type, options.bubbles, options.cancelable);
180                     $.extend(event, {
181                         view: options.view,
182                         ctrlKey: options.ctrlKey,
183                         altKey: options.altKey,
184                         shiftKey: options.shiftKey,
185                         metaKey: options.metaKey,
186                         keyCode: options.keyCode,
187                         charCode: options.charCode
188                     });
189                 }
190             } else if (document.createEventObject) {
191                 event = document.createEventObject();
192                 $.extend(event, options);
193             }
194 
195             if (!!/msie [\w.]+/.exec(navigator.userAgent.toLowerCase()) || (({}).toString.call(window.opera) === "[object Opera]")) {
196                 event.keyCode = (options.charCode > 0) ? options.charCode : options.keyCode;
197                 event.charCode = undefined;
198             }
199 
200             return event;
201         },
202 
203         dispatchEvent: function (elem, type, event) {
204             if (elem.dispatchEvent) {
205                 elem.dispatchEvent(event);
206             } else if (type === "click" && elem.click && elem.nodeName.toLowerCase() === "input") {
207                 elem.click();
208             } else if (elem.fireEvent) {
209                 elem.fireEvent("on" + type, event);
210             }
211         },
212 
213         simulateFocus: function () {
214             var focusinEvent,
215                 triggered = false,
216                 element = $(this.target);
217 
218             function trigger() {
219                 triggered = true;
220             }
221 
222             element.bind("focus", trigger);
223             element[0].focus();
224 
225             if (!triggered) {
226                 focusinEvent = $.Event("focusin");
227                 focusinEvent.preventDefault();
228                 element.trigger(focusinEvent);
229                 element.triggerHandler("focus");
230             }
231             element.unbind("focus", trigger);
232         },
233 
234         simulateBlur: function () {
235             var focusoutEvent,
236                 triggered = false,
237                 element = $(this.target);
238 
239             function trigger() {
240                 triggered = true;
241             }
242 
243             element.bind("blur", trigger);
244             element[0].blur();
245 
246             // blur events are async in IE
247             setTimeout(function () {
248                 // IE won't let the blur occur if the window is inactive
249                 if (element[0].ownerDocument.activeElement === element[0]) {
250                     element[0].ownerDocument.body.focus();
251                 }
252 
253                 // Firefox won't trigger events if the window is inactive
254                 // IE doesn't trigger events if we had to manually focus the body
255                 if (!triggered) {
256                     focusoutEvent = $.Event("focusout");
257                     focusoutEvent.preventDefault();
258                     element.trigger(focusoutEvent);
259                     element.triggerHandler("blur");
260                 }
261                 element.unbind("blur", trigger);
262             }, 1);
263         }
264     });
265 
266 
267     /** complex events **/
268 
269     function findCenter(elem) {
270         var offset,
271             document = $(elem.ownerDocument);
272         elem = $(elem);
273         offset = elem.offset();
274 
275         return {
276             x: offset.left + elem.outerWidth() / 2 - document.scrollLeft(),
277             y: offset.top + elem.outerHeight() / 2 - document.scrollTop()
278         };
279     }
280 
281     function findCorner(elem) {
282         var offset,
283             document = $(elem.ownerDocument);
284         elem = $(elem);
285         offset = elem.offset();
286 
287         return {
288             x: offset.left - document.scrollLeft(),
289             y: offset.top - document.scrollTop()
290         };
291     }
292 
293     $.extend($.simulate.prototype, {
294         simulateDrag: function () {
295             var i = 0,
296                 target = this.target,
297                 eventDoc = target.ownerDocument,
298                 options = this.options,
299                 center = options.handle === "corner" ? findCorner(target) : findCenter(target),
300                 x = Math.floor(center.x),
301                 y = Math.floor(center.y),
302                 coord = {clientX: x, clientY: y},
303                 dx = options.dx || ( options.x !== undefined ? options.x - x : 0 ),
304                 dy = options.dy || ( options.y !== undefined ? options.y - y : 0 ),
305                 moves = options.moves || 3;
306 
307             this.simulateEvent(target, "mousedown", coord);
308 
309             for (; i < moves; i++) {
310                 x += dx / moves;
311                 y += dy / moves;
312 
313                 coord = {
314                     clientX: Math.round(x),
315                     clientY: Math.round(y)
316                 };
317 
318                 this.simulateEvent(eventDoc, "mousemove", coord);
319             }
320 
321             if ($.contains(eventDoc, target)) {
322                 this.simulateEvent(target, "mouseup", coord);
323                 this.simulateEvent(target, "click", coord);
324             } else {
325                 this.simulateEvent(eventDoc, "mouseup", coord);
326             }
327         }
328     });
329 
330 })(jQuery);
View Code

 

格式化json数据,高亮显示

 

 1 /**
 2  * 格式化json
 3  * @param json
 4  * @returns {string|XML}
 5  */
 6 function jsonSyntaxHighLight(json) {
 7     if (typeof json != 'string')
 8         json = JSON.stringify(json, undefined, 2);
 9     json = json.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>');
10     return json.replace(/("(\\u[a-zA-Z0-9]{4}|\\[^u]|[^\\"])*"(\s*:)?|\b(true|false|null)\b|-?\d+(?:\.\d*)?(?:[eE][+\-]?\d+)?)/g, function (match) {
11         var cls = 'number';
12         if (/^"/.test(match)) {
13             if (/:$/.test(match)) {
14                 cls = 'key';
15             } else {
16                 cls = 'string';
17             }
18         } else if (/true|false/.test(match)) {
19             cls = 'boolean';
20         } else if (/null/.test(match)) {
21             cls = 'null';
22         }
23         return '<span class="' + cls + '">' + match + '</span>';
24     });
25 }

 

操作:
(以懒财网公告为例,测试)目前已经测试懒财,cnblog。。。
1.首先安装tampermonkey插件下载地址: http://tampermonkey.net/
2.新建脚本,复制web-extract-list.js 内容粘贴 ctrl+s
3.新建脚本,复制web-extract-detail.js 内容粘贴 ctrl+s
4.打开https://www.lancai.cn/about/notice.html 看执行效果

采集结束之后,json页面:

注意:根据采集的网站不同需要变更js文件里面的// @match 处匹配的url, 以及task_json的脚本配置信息


项目代码github地址:https://github.com/jstarseven/web-list-extract

码字挺累的,转载请注明出处:http://www.cnblogs.com/jstarseven/p/6278197.html

  


  -END- 

 

posted @ 2017-01-12 14:54  jstarseven  阅读(7112)  评论(2编辑  收藏  举报