Java微博搜索关键字采集

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Random;
import java.util.concurrent.Callable;

import org.apache.http.client.CookieStore;
import org.apache.log4j.Logger;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.Cookie;


/**
 * Crawls weibo.cn keyword-search result pages and stores each page as an HTML file.
 *
 * <p>For every result page the crawler picks one of several serialized cookie files at
 * random, loads the cookies into a shared HtmlUnit {@link WebClient}, downloads the
 * search page and writes it below {@code outputpath/yyyyMMdd/}.
 *
 * <p>NOTE: the configuration fields are {@code static}, so the constructor overwrites
 * settings shared by every instance of this class. {@link #call()} is still a stub; the
 * crawl itself is driven from {@link #main(String[])}.
 */
public class SinaSearchCrawlerCommand implements Callable<Object> {
    private static final Logger logger = Logger.getLogger(SinaSearchCrawlerCommand.class);

    /** Number of result pages requested per run. */
    private static final int MAX_PAGES = 100;
    /** Cookie files are named {@code cookiePath + n} with n in [0, COOKIE_FILE_COUNT). */
    private static final int COOKIE_FILE_COUNT = 7;
    /** Pause between two page requests, in milliseconds. */
    private static final long REQUEST_INTERVAL_MS = 10000L;
    /** Shared generator, so a new Random is not created for every cookie choice. */
    private static final Random RANDOM = new Random();

    private static String word="如家";
    private static String cookiePath="E:\\学习\\微博爬虫\\cookie\\cookie.file";
    private static String outputpath="E:\\学习\\微博爬虫\\";

    /**
     * Runs the crawl: requests result pages 1..{@value #MAX_PAGES} for the configured
     * keyword, sleeping {@value #REQUEST_INTERVAL_MS} ms after each page.
     *
     * <p>A page whose cookie file is missing or unreadable, or whose download fails, is
     * logged and skipped instead of aborting the whole run. The method returns early if
     * the thread is interrupted while sleeping.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        // Encode into a local so the static keyword is never double-encoded on a re-run.
        String encodedWord = word;
        try {
            encodedWord = java.net.URLEncoder.encode(word, "utf-8");
        } catch (UnsupportedEncodingException e2) {
            logger.error("Cannot URL-encode search keyword, using it unencoded", e2);
        }

        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17);
        webClient.getCookieManager().setCookiesEnabled(true);

        for (int i = 1; i <= MAX_PAGES; i++) {
            // Choose the cookie file once so the printed path is the one actually used.
            String chosenCookiePath = cookiePathAppendRandom();
            System.out.println(chosenCookiePath);
            File file = new File(chosenCookiePath);
            if (file.exists()) {
                try {
                    crawlPage(webClient, file, encodedWord, i);
                } finally {
                    webClient.closeAllWindows();
                }
            } else {
                logger.warn("CookiePath doesn`t exit !!!");
            }

            logger.info("execution:");
            try {
                Thread.sleep(REQUEST_INTERVAL_MS);
            } catch (InterruptedException e) {
                logger.error("Interrupted while waiting between requests", e);
                // Restore the flag so code further up the stack can see the interrupt.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Downloads one search-result page using the cookies from {@code cookieFile} and
     * saves it to disk. Any failure is logged and the page is skipped.
     */
    private static void crawlPage(WebClient webClient, File cookieFile, String encodedWord, int pageNumber) {
        CookieStore cookieStore = loadCookieStore(cookieFile);
        if (cookieStore == null) {
            return;
        }

        List<org.apache.http.cookie.Cookie> storedCookies = cookieStore.getCookies();
        for (org.apache.http.cookie.Cookie temp : storedCookies) {
            Cookie cookie = new Cookie(temp.getDomain(), temp.getName(),
                    temp.getValue(), temp.getPath(), temp.getExpiryDate(),
                    false);
            webClient.getCookieManager().addCookie(cookie);
        }

        HtmlPage page = fetchPage(webClient,
                "http://weibo.cn/search/mblog?hideSearchFrame=&keyword=" + encodedWord + "&page=" + pageNumber);
        if (page == null) {
            return;
        }
        savePage(page, cookieFile, pageNumber);
    }

    /**
     * Reads a Java-serialized {@link CookieStore} from {@code cookieFile}.
     *
     * @return the cookie store, or {@code null} if the file cannot be read or does not
     *         contain a {@code CookieStore}
     */
    private static CookieStore loadCookieStore(File cookieFile) {
        FileInputStream fin = null;
        ObjectInputStream in = null;
        try {
            fin = new FileInputStream(cookieFile);
            // Native deserialization: only point cookiePath at files you created yourself.
            in = new ObjectInputStream(fin);
            Object stored = in.readObject();
            if (stored instanceof CookieStore) {
                return (CookieStore) stored;
            }
            logger.error("Cookie file does not contain a CookieStore: " + cookieFile);
        } catch (IOException e) {
            logger.error("Cannot read cookie file " + cookieFile, e);
        } catch (ClassNotFoundException e) {
            logger.error("Cannot deserialize cookie file " + cookieFile, e);
        } finally {
            // Closing the object stream also closes the underlying file stream.
            if (in != null) {
                closeQuietly(in);
            } else {
                closeQuietly(fin);
            }
        }
        return null;
    }

    /**
     * Fetches {@code url} with the given client.
     *
     * @return the page, or {@code null} if the request failed (the failure is logged)
     */
    private static HtmlPage fetchPage(WebClient webClient, String url) {
        try {
            HtmlPage page = webClient.getPage(url);
            return page;
        } catch (FailingHttpStatusCodeException e) {
            logger.error("Request failed with an error status: " + url, e);
        } catch (IOException e) {
            // Also covers MalformedURLException, which is an IOException.
            logger.error("Request failed: " + url, e);
        }
        return null;
    }

    /**
     * Writes the page content to
     * {@code outputpath/yyyyMMdd/<millis><cookie file name>.html}, creating the day
     * directory when needed. An already existing target file is never overwritten.
     */
    private static void savePage(HtmlPage page, File cookieFile, int pageNumber) {
        SimpleDateFormat dayformat = new SimpleDateFormat("yyyyMMdd");
        long now = System.currentTimeMillis();
        String dayDirPath = outputpath + "/" + dayformat.format(now);
        String path = dayDirPath + "/" + now + cookieFile.getName() + ".html";

        File dayDir = new File(dayDirPath);
        if (!dayDir.exists() && !dayDir.mkdirs()) {
            logger.error("Cannot create output directory " + dayDirPath);
            return;
        }

        File target = new File(path);
        System.out.println("当前页"+pageNumber+",采集至"+path);
        if (target.exists()) {
            logger.warn("outfile exit!");
            return;
        }

        FileOutputStream outputStream = null;
        try {
            outputStream = new FileOutputStream(target);
            // Fixed charset so the saved bytes do not depend on the platform default.
            outputStream.write(page.getWebResponse().getContentAsString().getBytes("UTF-8"));
        } catch (IOException e) {
            logger.error("Cannot write page to " + path, e);
        } finally {
            closeQuietly(outputStream);
        }
    }

    /** Closes {@code resource} if it is non-null; a failure to close is only logged. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Failed to close resource", e);
        }
    }

    /**
     * Returns the base cookie path with a random digit in
     * [0, {@value #COOKIE_FILE_COUNT}) appended.
     */
    private static String cookiePathAppendRandom() {
        return cookiePath + RANDOM.nextInt(COOKIE_FILE_COUNT);
    }

    /**
     * Sets the crawl configuration. Because the backing fields are static, the values
     * apply to every instance of this class, not only to the one being constructed.
     *
     * @param word       search keyword; every {@code '&'} is replaced by a space
     * @param cookiePath base path of the serialized cookie files
     * @param outputpath directory below which the downloaded pages are written
     */
    public SinaSearchCrawlerCommand(String word, String cookiePath, String outputpath) {
        SinaSearchCrawlerCommand.word = word.replace("&", " ");
        SinaSearchCrawlerCommand.cookiePath = cookiePath;
        SinaSearchCrawlerCommand.outputpath = outputpath;
    }

    @Override
    public String toString() {
        return "SinaSearchCrawlerCommand [word=" + word + ", outputpath="
                + outputpath + "]";
    }

    /**
     * Not implemented yet: performs no work and always returns {@code null}.
     *
     * @return {@code null}
     */
    @Override
    public Object call() throws Exception {
        // TODO: move the crawl from main(String[]) into this method.
        return null;
    }


}

 

posted @ 2016-04-09 02:32  陈泽泽  阅读(712)  评论(0编辑  收藏  举报