htmlunit简单百度搜索,网页解析

package com;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.regexp.recompile;
import org.apache.xalan.templates.ElemApplyImport;

import bean.User;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;

import dao.UserDao;
/**
 * Submits a keyword to the Baidu search form through HtmlUnit, extracts the
 * reported number of results from the rendered page text and stores that
 * number together with the result page URL on the user record.
 */
public class TimingSearch {
    /** Text that precedes the result count when Baidu reports an approximate total. */
    private static final String APPROX_MARKER = "百度为您找到相关结果约";
    /** Text that precedes the result count when the "约" (approximately) suffix is absent. */
    private static final String EXACT_MARKER = "百度为您找到相关结果";
    /** Counter word that immediately follows the result count. */
    private static final String COUNT_UNIT = "个";

    public static TimingSearch instance=new TimingSearch();

    /**
     * @return the shared {@code TimingSearch} instance
     */
    public static TimingSearch getInstance() {
        return instance;
    }

    // The three buffers below used to be scratch space for downHtml. Parsing now
    // uses local variables only, so a failed or concurrent call can no longer
    // leave stale text behind. The fields are kept, always empty, only so that
    // same-package code referring to them still compiles.
    StringBuffer stringWeb=new StringBuffer("");
    StringBuffer opWeb=new StringBuffer("");
    StringBuffer endWeb=new StringBuffer("");

    /**
     * Searches Baidu for {@code key} and records the outcome for the user {@code id}.
     *
     * <p>When a result count is found in the page text, it and the result page URL
     * are set on the user object and written through {@code UserDao.updateUser}.
     * When no count can be found, the user object is returned unchanged and
     * nothing is written.
     *
     * @param id  identifier passed to {@code UserDao.getUserObject} and {@code UserDao.updateUser}
     * @param key keyword typed into the search box
     * @return the user object loaded for {@code id}, updated if a count was found
     * @throws FailingHttpStatusCodeException propagated from HtmlUnit page loading
     * @throws MalformedURLException propagated from HtmlUnit page loading
     * @throws IOException propagated from HtmlUnit page loading or form submission
     */
    public final User downHtml(int id,String key) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        User list=(User) UserDao.getInstance().getUserObject(id);

        final WebClient webclient=new WebClient();
        final String pageText;
        final String url;
        try {
            webclient.setCssEnabled(false);
            webclient.setJavaScriptEnabled(false);
            final HtmlPage htmlPage=webclient.getPage("http://www.baidu.com");

            final HtmlForm form=htmlPage.getFormByName("f");
            final HtmlSubmitInput button=form.getInputByValue("百度一下");
            final HtmlTextInput textFileld=form.getInputByName("wd");
            // Type the keyword into the search box and submit the form.
            textFileld.setValueAttribute(String.valueOf(key));
            final HtmlPage page2=button.click();

            pageText=page2.asText().trim();
            url=page2.getUrl().toString();
        } finally {
            // Release the client's windows and connections even when a step above throws.
            webclient.closeAllWindows();
        }

        final String num=extractResultCount(pageText);
        if(num==null){
            // No recognizable result count on the page: leave the user untouched.
            return list;
        }
        list.setNum(num);
        list.setUrl(url);
        UserDao.getInstance().updateUser(num, url, id);
        return list;
    }

    /**
     * Returns the text between the last result-count marker and the first
     * following counter word, or {@code null} when either one is missing.
     */
    private static String extractResultCount(String pageText) {
        int markerStart=pageText.lastIndexOf(APPROX_MARKER);
        int markerLength=APPROX_MARKER.length();
        if(markerStart==-1){
            // Fall back to the wording without the trailing "约".
            markerStart=pageText.lastIndexOf(EXACT_MARKER);
            markerLength=EXACT_MARKER.length();
        }
        if(markerStart==-1){
            return null;
        }
        final int countStart=markerStart+markerLength;
        final int countEnd=pageText.indexOf(COUNT_UNIT, countStart);
        if(countEnd==-1){
            // Marker present but no counter word after it; previously this
            // crashed with StringIndexOutOfBoundsException.
            return null;
        }
        return pageText.substring(countStart, countEnd);
    }

    /**
     * Manual smoke run: performs one search for a fixed user id and keyword and
     * prints the stack trace of any failure.
     */
    public static void main(String[] args)  {
        TimingSearch t=getInstance();
        try {
            t.downHtml(2012002, "就是点击开始的空间");
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

 

posted @ 2013-01-08 13:14  黑暗&独舞  阅读(2940)  评论(0编辑  收藏  举报