A code snippet

import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.creditcloud.brick.task.CrawlTask;
import com.creditcloud.brick.task.Extractor;
import com.creditcloud.brick.task.Field;

@NoArgsConstructor
@AllArgsConstructor
@Data
@Slf4j
public class DataCrawler {
    String                 url;
    String                 space;
    
    public HashMap<String, Object> doCrawl(CrawlTask task ) {
        HashMap<String, Object> result = new HashMap<String, Object>();
        this.url = task.getUrl();
        this.space = task.getId();
        //
        String content=this.doGet(url);
        if( StringUtils.isNotEmpty(content)) {
            Extractor actor=task.getExtractor();
            if( actor != null )
                this.parse(actor, content, result);
        }
        return result;
    }
    
    public String doGet( String url ) {
        String data=null;
        //return Jsoup.connect(url).userAgent("Mozilla").get();
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpGet hGet = new HttpGet(url);
        log.info(url);
        CloseableHttpResponse response = null;
        try {
            response = httpclient.execute(hGet);
            HttpEntity entity = response.getEntity();    
            System.out.println(response.getStatusLine());
            log.info( response.getStatusLine().toString() );
            //
            if (entity != null) {  
                System.out.println("Response content length: " + entity.getContentLength());
                data = EntityUtils.toString(entity);
                System.out.println(data);  
                EntityUtils.consume(entity);
                
            }
        }
        catch(Exception e){
            log.error(e.getMessage());
        }
        //response
        try 
        {  
            if( response != null )
                response.close();  
        }
        catch(Exception e) {
            log.error(e.getMessage());
        }
        //httpclient
        try {
            if( httpclient != null )
                httpclient.close();
        } catch (Exception e) {
            log.error(e.getMessage());
        }

        return data;
    }
    
    public String removeHtmlLabel( String input ) {
        return input.replaceAll("<[^>]+>", "").replaceAll("&nbsp;"," ").trim();
    }
    
    //
    public ArrayList<String> match( Extractor extractor, String input ) {
        ArrayList<String> result = new ArrayList<String>();
        switch (extractor.getType()) {
        case css:    //call css
        {
            Document doc = Jsoup.parse(input);
            Elements elems = doc.select(extractor.getPattern());
            for( Element elem:elems ) {
                result.add( elem.toString() );
            }
        }
        break;
        case regex: //call regex
        {
            Pattern p = Pattern.compile( extractor.getPattern());
            Matcher m = p.matcher( input );
            String matchValue = null;
            while(m.find()) {
                matchValue = StringEscapeUtils.unescapeHtml4( m.group());
                result.add(matchValue);
            }
        }
        break;
        case empty:
            result.add(input);
            break;
        }
        return result;
    }
    
    public void parse( Extractor extractor, String input, HashMap<String, Object> result ) {
        //1. match by css or regex
        ArrayList<String> strlist = this.match(extractor, input);
        if( strlist.isEmpty() ) {
            //result.put( extractor.getId(), null);
            return;
        }
        //2. call children extractors
        switch(extractor.getData()) {
        case array:{
            //result.setType(ResultDataType.array);
            ArrayList<HashMap<String, Object>> list = new ArrayList<HashMap<String, Object>>();
            for( String str:strlist ) {
                HashMap<String, Object> childResult = new HashMap<String, Object>();
                for( Extractor one:extractor.getChildren()) {
                    this.parse(one, str, childResult);
                }
                if( childResult.isEmpty() == false )
                    list.add(childResult);
            }
            if(list.isEmpty() == false )
                result.put( extractor.getId(), list );
        }
        break;
        case field:{
            for(Field fd:extractor.getFields()) {
                String val=strlist.get( fd.getIndex() );
                result.put( fd.getName(), this.removeHtmlLabel(val) );
            }
        }
        break;
        case none: {
            for( String str:strlist ) {
                for( Extractor one:extractor.getChildren()) {
                    this.parse(one, str, result);
                }
            }
        }
        break;
        }
    }
}

 

posted @ 2015-05-15 11:16  samu  阅读(216)  评论(0编辑  收藏  举报