webMagic的PageProcessor接口实现类知识整理

webMagic的PageProcessor(页处理器,通过此接口的实现类,可通过css、xpath、json来分析网页中的内容)接口实现类知识整理

站点配置成功,可获取网站html内容后,

1:PageProcessor.java接口主要有两个方法:process()和getSite()。

/**
 * Contract for a page processor: it handles a downloaded {@link Page} and
 * supplies the {@link Site} settings to use.
 */
public interface PageProcessor {

    /**
     * Processes the page: extracts the urls to fetch next, extracts the data
     * and stores it.
     *
     * @param page the page to process
     */
    void process(Page page);

    /**
     * Returns the site settings (for example the fetch timeout, the wait
     * before a retry and the number of retries).
     *
     * @return the site settings
     * @see Site
     */
    Site getSite();
}

2:Page.java 类中拥有的方法 列表

fail()
setSkip(boolean skip) //设置skip为true之后,这个页面的结果不会被Pipeline处理  if (page.getResultItems().get("name")==null){page.setSkip(true);}
public void putField(String key, Object field) ; //这个结果会最终保存到ResultItems中
getHtml() //通过Page.getHtml()得到页面的html
getJson() //得到结果返回json格式
setHtml() //已废弃(自0.4.0起),请改用setRawText(String)
public void addTargetRequests(List<String> requests)
public void addTargetRequest(String requestString)
public void addTargetRequests(List<String> requests, long priority)
public void addTargetRequest(Request request) 
 
getUrl() //获取当前页面的url
setUrl
getRequest
setRequest
public ResultItems getResultItems() ; //得到存储结果,多个字段以对象形式展现
getStatusCode
setStatusCode
getRawText
setRawText
getHeaders
setHeaders
isDownloadSuccess
setDownloadSuccess
getBytes
setBytes
getCharset
setCharset
toString

 

Page.java 源码

package us.codecraft.webmagic;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.utils.HttpConstant;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Object storing extracted result and urls to fetch.<br>
*  对象存取的结果和获取urls(从分析结果中获取新的url地址 * Not thread safe.<br> * Main method: <br> * {
@link #getUrl()} get url of current page <br> * {@link #getHtml()} get content of current page <br> * {@link #putField(String, Object)} save extracted result <br> * {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}<br> * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch <br> * * @author code4crafter@gmail.com <br> * @see us.codecraft.webmagic.downloader.Downloader * @see us.codecraft.webmagic.processor.PageProcessor * @since 0.1.0 */ public class Page { private Request request; private ResultItems resultItems = new ResultItems(); private Html html; private Json json; private String rawText; private Selectable url; private Map<String,List<String>> headers; private int statusCode = HttpConstant.StatusCode.CODE_200; private boolean downloadSuccess = true; private byte[] bytes; private List<Request> targetRequests = new ArrayList<Request>(); private String charset; public Page() { } public static Page fail(){ Page page = new Page(); page.setDownloadSuccess(false); return page; }    public Page setSkip(boolean skip) { resultItems.setSkip(skip); return this; } /** * store extract results * * @param key key * @param field field */ public void putField(String key, Object field) { resultItems.put(key, field); } /** * get html content of page * * @return html */ public Html getHtml() { if (html == null) { html = new Html(rawText, request.getUrl()); } return html; } /** * get json content of page * * @return json * @since 0.5.0 */ public Json getJson() { if (json == null) { json = new Json(rawText); } return json; } /** * @param html html * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. 
*/ public void setHtml(Html html) { this.html = html; } public List<Request> getTargetRequests() { return targetRequests; } /** * add urls to fetch * * @param requests requests */ public void addTargetRequests(List<String> requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s)); } } /** * add urls to fetch * * @param requests requests * @param priority priority */ public void addTargetRequests(List<String> requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; } s = UrlUtils.canonicalizeUrl(s, url.toString()); targetRequests.add(new Request(s).setPriority(priority)); } } /** * add url to fetch * * @param requestString requestString */ public void addTargetRequest(String requestString) { if (StringUtils.isBlank(requestString) || requestString.equals("#")) { return; } requestString = UrlUtils.canonicalizeUrl(requestString, url.toString()); targetRequests.add(new Request(requestString)); } /** * add requests to fetch * * @param request request */ public void addTargetRequest(Request request) { targetRequests.add(request); } /** * get url of current page * * @return url of current page */ public Selectable getUrl() { return url; } public void setUrl(Selectable url) { this.url = url; } /** * get request of current page * * @return request */ public Request getRequest() { return request; } public void setRequest(Request request) { this.request = request; this.resultItems.setRequest(request); } public ResultItems getResultItems() { return resultItems; } public int getStatusCode() { return statusCode; } public void setStatusCode(int statusCode) { this.statusCode = statusCode; } public String getRawText() { return rawText; } public Page setRawText(String rawText) { this.rawText = rawText; return this; } public Map<String, 
List<String>> getHeaders() { return headers; } public void setHeaders(Map<String, List<String>> headers) { this.headers = headers; } public boolean isDownloadSuccess() { return downloadSuccess; } public void setDownloadSuccess(boolean downloadSuccess) { this.downloadSuccess = downloadSuccess; } public byte[] getBytes() { return bytes; } public void setBytes(byte[] bytes) { this.bytes = bytes; } public String getCharset() { return charset; } public void setCharset(String charset) { this.charset = charset; } @Override public String toString() { return "Page{" + "request=" + request + ", resultItems=" + resultItems + ", html=" + html + ", json=" + json + ", rawText='" + rawText + '\'' + ", url=" + url + ", headers=" + headers + ", statusCode=" + statusCode + ", downloadSuccess=" + downloadSuccess + ", targetRequests=" + targetRequests + ", charset='" + charset + '\'' + ", bytes=" + Arrays.toString(bytes) + '}'; } }

 

posted @ 2020-03-16 16:04  码哥之旅  阅读(1057)  评论(0编辑  收藏  举报