多线程爬虫

参数：private int webDepth = 2;//爬虫深度。主页的深度为1，超过所设深度的网页不会被抓取。

private int intThreadNum = 10;//线程数。开启的线程数。

抓取时也会在程序源文件目录下生成一个report.txt文件记录爬虫的运行情况，并在抓取结束后生成一个fileindex.txt文件维护网页文件索引。

本程序用到了多线程(静态变量和同步)，泛型，文件操作，URL类和连接，Hashtable类关联数组，正则表达式及其相关类。运行时需使用命令行参数，第一个参数应使用http://开头的有效URL字符串作为爬虫的主页，第二个参数（可选）应输入可转换为int型的字符串（用Integer.parseInt(String s)静态方法可以转换的字符串，如3）作为爬虫深度，如果没有，则默认深度为2。

本程序的不足之处是：只考虑了href= href=' href="后加绝对url的这三种情况(由于url地址在网页源文件中情况比较复杂，有时处理也会出现错误)，还有相对url和window.open('的情况没有考虑。异常处理程序也只是简单处理。如果读者有改进办法可以把源代码贴出，不胜感激。

附上源代码如下（保存名为GetWeb.java）：

import java.io.File;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Hashtable;

/**
 * A small multi-threaded crawler.
 *
 * <p>Starting from a home page it downloads every page whose absolute
 * {@code href} URL contains the home page's domain, down to a configurable
 * depth (the home page itself is depth 1). Pages are stored as
 * {@code web/web<N>.htm}; progress is logged to {@code report.txt} and a
 * page index is written to {@code fileindex.txt} when the crawl ends.
 *
 * <p>All shared crawl state is guarded by this object's monitor.
 */
public class GetWeb {
    /** Upper bound for establishing a connection, so a dead host cannot stall a worker forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound for a single blocking read on an open connection. */
    private static final int READ_TIMEOUT_MS = 30000;
    /** Extracts the registrable domain (for example {@code baidu.com}) from an http:// URL. */
    private static final Pattern DOMAIN_PATTERN = Pattern.compile(
            "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)",
            Pattern.CASE_INSENSITIVE);

    private int webDepth = 2;               // maximum crawl depth; the home page is depth 1
    private int intThreadNum = 10;          // number of worker threads
    private String strHomePage = "";        // home page URL
    private String myDomain;                // domain extracted from the home page URL
    private String fPath = "web";           // directory the downloaded pages are stored in
    private ArrayList<String> arrUrls = new ArrayList<String>();    // URLs waiting to be fetched
    private ArrayList<String> arrUrl = new ArrayList<String>();     // every URL seen, kept for the index
    private Hashtable<String,Integer> allUrls = new Hashtable<String,Integer>();     // URL -> page number
    private Hashtable<String,Integer> deepUrls = new Hashtable<String,Integer>();    // URL -> depth
    private int intWebIndex = 0;            // page number of the most recently registered URL
    private String charset = "GB2312";
    private boolean reportStarted = false;  // true once report.txt has been (re)created by this crawler
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;
    private int inFlight = 0;               // pages currently being fetched by worker threads
    private Pattern linkPattern;            // lazily built, depends on myDomain

    /**
     * Creates a crawler with the default depth of 2.
     *
     * @param s home page URL; must start with {@code http://}
     */
    public GetWeb(String s){
        this.strHomePage = s;
    }

    /**
     * Creates a crawler with an explicit depth limit.
     *
     * @param s home page URL; must start with {@code http://}
     * @param i maximum crawl depth (the home page is depth 1)
     */
    public GetWeb(String s,int i){
        this.strHomePage = s;
        this.webDepth = i;
    }

    /** Counts one successfully downloaded page. */
    public synchronized void addWebSuccessed(){
        webSuccessed++;
    }

    /** Counts one page that could not be downloaded. */
    public synchronized void addWebFailed(){
        webFailed++;
    }

    /**
     * Appends text to {@code report.txt}.
     *
     * <p>The first successful call truncates any report left over from an
     * earlier run; later calls append, so the cost of logging does not grow
     * with the size of the report.
     *
     * @param s text to append (callers supply their own line breaks)
     */
    public synchronized void addReport(String s){
        PrintWriter pwReport = null;
        try{
            pwReport = new PrintWriter(new FileOutputStream("report.txt", reportStarted));
            pwReport.print(s);
            reportStarted = true;
        }catch(Exception e){
            System.out.println("生成报告文件失败!");
        }finally{
            if (pwReport != null){
                pwReport.close();
            }
        }
    }

    /**
     * Removes and returns the oldest URL still waiting to be fetched.
     *
     * @return the next pending URL
     * @throws IndexOutOfBoundsException if no URL is pending
     */
    public synchronized String getAUrl(){
        return arrUrls.remove(0);
    }

    /**
     * Removes and returns the oldest entry of the index list.
     *
     * @return the next URL to be written to the index
     * @throws IndexOutOfBoundsException if the index list is empty
     */
    public synchronized String getUrl(){
        return arrUrl.remove(0);
    }

    /**
     * Allocates the next page number.
     *
     * @return the newly allocated page number (1 on the first call)
     */
    public synchronized Integer getIntWebIndex(){
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the home page URL; the optional
     *             {@code args[1]} is the crawl depth as a decimal integer
     */
    public static void main(String[] args){
        if (args.length == 0 || args[0].equals("")){
            System.out.println("No input!");
            System.exit(1);
            return;
        }
        GetWeb gw;
        if (args.length == 1){
            gw = new GetWeb(args[0]);
        } else {
            try {
                gw = new GetWeb(args[0],Integer.parseInt(args[1]));
            } catch (NumberFormatException e) {
                // A non-numeric depth is a usage error, not a reason for a stack trace.
                System.out.println("Wrong input!");
                System.exit(1);
                return;
            }
        }
        gw.getWebByHomePage();
    }

    /**
     * Runs the whole crawl: fetches the home page on the calling thread,
     * lets the worker threads fetch everything discovered from it, waits for
     * them to finish, then prints the summary and writes {@code fileindex.txt}.
     */
    public void getWebByHomePage(){
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null){
            System.out.println("Wrong input!");
            return;
        }

        System.out.println("Homepage = " + strHomePage);
        addReport("Homepage = " + strHomePage + "!\n");
        System.out.println("Domain = " + myDomain);
        addReport("Domain = " + myDomain + "!\n");
        synchronized (this) {
            arrUrls.add(strHomePage);
            arrUrl.add(strHomePage);
            allUrls.put(strHomePage,0);
            deepUrls.put(strHomePage,1);
        }

        File fDir = new File(fPath);
        if(!fDir.exists()){fDir.mkdir();}

        System.out.println("Start!");
        this.addReport("Start!\n");
        // Fetch the home page first so the queue is populated before the workers start.
        String tmp = getAUrl();
        this.getWebByUrl(tmp,charset,allUrls.get(tmp)+"");

        ArrayList<Thread> workers = new ArrayList<Thread>();
        for (int i = 0; i < intThreadNum; i++){
            Thread worker = new Thread(new Processer(this));
            workers.add(worker);
            worker.start();
        }
        // Wait for the workers themselves instead of spinning on the global thread count.
        for (Thread worker : workers){
            try {
                worker.join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }

        long finishTime = System.currentTimeMillis();
        long costTime = finishTime - startTime;
        int succeeded;
        int failed;
        synchronized (this) {
            succeeded = webSuccessed;
            failed = webFailed;
        }
        System.out.println("\n\n\n\n\nFinished!");
        addReport("\n\n\n\n\nFinished!\n");
        System.out.println("Start time = " + startTime + "   " + "Finish time = " + finishTime + "   "
                + "Cost time = " + costTime + "ms");
        addReport("Start time = " + startTime + " " + "Finish time = " + finishTime + " "
                + "Cost time = " + costTime + "ms" + "\n");
        System.out.println("Total url number = " + (succeeded+failed) + " Successed: " + succeeded
                + " Failed: " + failed);
        addReport("Total url number = " + (succeeded+failed) + " Successed: " + succeeded
                + " Failed: " + failed + "\n");

        StringBuilder strIndex = new StringBuilder();
        while (!arrUrl.isEmpty()){
            String tmpUrl = getUrl();
            strIndex.append("Web depth:").append(deepUrls.get(tmpUrl))
                    .append("   Filepath: ").append(fPath).append("/web").append(allUrls.get(tmpUrl)).append(".htm")
                    .append("   url:").append(tmpUrl).append("\n\n");
        }
        System.out.println(strIndex);
        PrintWriter pwIndex = null;
        try{
            pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
            pwIndex.println(strIndex.toString());
        }catch(Exception e){
            System.out.println("生成索引文件失败!");
        }finally{
            if (pwIndex != null){
                pwIndex.close();
            }
        }
    }

    /**
     * Downloads one page, stores it as {@code web/web<fileIndex>.htm} and,
     * while the page's depth is below the limit, registers the links found in it.
     *
     * <p>Failures are reported and counted; no exception escapes.
     *
     * @param strUrl    URL of the page to download
     * @param charset   charset used to decode the page and encode the stored copy;
     *                  {@code null}, empty or unsupported names fall back to the platform default
     * @param fileIndex page number used in the stored file name
     */
    public void getWebByUrl(String strUrl,String charset,String fileIndex)
    {
        InputStream is = null;
        PrintWriter pw = null;
        try
        {
            System.out.println("Getting web by url: " + strUrl);
            addReport("Getting web by url: " + strUrl + "\n");

            java.nio.charset.Charset cs = resolveCharset(charset);

            URL url = new URL(strUrl);
            // Read through the configured connection; opening a second stream from the
            // URL would bypass the timeouts.
            URLConnection conn = url.openConnection();
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            is = conn.getInputStream();

            String filePath = fPath + "/web" + fileIndex + ".htm";
            pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(filePath), cs));
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is, cs));

            // Pages at the depth limit are stored but their links are not followed.
            Integer depth = deepUrls.get(strUrl);
            boolean followLinks = depth != null && depth.intValue() < webDepth;

            String rLine;
            while ((rLine = bReader.readLine()) != null){
                if (rLine.length() > 0){    // blank lines are not stored
                    pw.println(rLine);
                    if (followLinks){
                        getUrlByString(rLine,strUrl);
                    }
                }
            }
            System.out.println("Get web successfully! " + strUrl);
            addReport("Get web successfully! " + strUrl + "\n");
            addWebSuccessed();
        }catch (Exception e){
            System.out.println("Get web failed!       " + strUrl);
            addReport("Get web failed!       " + strUrl + "\n");
            addWebFailed();
        }finally{
            if (pw != null){
                pw.close();
            }
            if (is != null){
                try {
                    is.close();
                } catch (Exception ignored) {
                    // Nothing useful can be done when closing the page stream fails.
                }
            }
        }
    }

    /**
     * Extracts the domain from the home page URL.
     *
     * @return the domain (for example {@code baidu.com}), or {@code null} when the
     *         home page is not an {@code http://} URL with a recognised top-level domain
     */
    public String getDomain(){
        if (strHomePage == null){
            return null;
        }
        Matcher m = DOMAIN_PATTERN.matcher(strHomePage);
        return m.find() ? m.group(0) : null;
    }

    /**
     * Scans one line of page source for absolute http/https links that contain
     * the crawl domain and queues every link not seen before, one level deeper
     * than the page it was found on.
     *
     * <p>Does nothing when the domain has not been determined yet or when
     * {@code strUrl} is not a registered page.
     *
     * @param inputArgs a line of HTML source
     * @param strUrl    URL of the page the line was read from
     */
    public void getUrlByString(String inputArgs,String strUrl){
        if (inputArgs == null || strUrl == null || myDomain == null){
            return;
        }
        Integer parentDepth = deepUrls.get(strUrl);
        if (parentDepth == null){
            return;
        }
        int depth = parentDepth.intValue() + 1;
        Matcher m = getLinkPattern().matcher(inputArgs);
        while (m.find()){
            String found = m.group(0);
            if (registerUrl(found, depth)){
                System.out.println("Find a new url,depth:" + depth + " "+ found);
                addReport("Find a new url,depth:" + depth + " "+ found + "\n");
            }
        }
    }

    /**
     * Builds (once) the link pattern for the current domain. The scheme is matched
     * literally and the domain is quoted so its dots are not regex wildcards.
     */
    private synchronized Pattern getLinkPattern(){
        if (linkPattern == null){
            linkPattern = Pattern.compile(
                    "(?<=(href=)[\"]?[\']?)https?://[^\\s\"\'\\?]*(" + Pattern.quote(myDomain) + ")[^\\s\"\'>]*",
                    Pattern.CASE_INSENSITIVE);
        }
        return linkPattern;
    }

    /**
     * Atomically records a newly discovered URL and wakes idle workers.
     *
     * @return {@code true} if the URL was new, {@code false} if it was already known
     */
    private synchronized boolean registerUrl(String found, int depth){
        if (allUrls.containsKey(found)){
            return false;
        }
        arrUrls.add(found);
        arrUrl.add(found);
        allUrls.put(found,getIntWebIndex());
        deepUrls.put(found,depth);
        notifyAll();
        return true;
    }

    /**
     * Hands the next pending URL to a worker, waiting while the queue is empty
     * but other workers are still fetching pages that may add more.
     *
     * @return the next URL, or {@code null} once the crawl is complete
     */
    private synchronized String takeUrl() throws InterruptedException {
        while (arrUrls.isEmpty()){
            if (inFlight == 0){
                return null;
            }
            wait();
        }
        inFlight++;
        return arrUrls.remove(0);
    }

    /** Marks one fetch as finished and lets waiting workers re-check the queue. */
    private synchronized void finishUrl(){
        inFlight--;
        notifyAll();
    }

    /** Returns the named charset, or the platform default when the name is missing or unusable. */
    private static java.nio.charset.Charset resolveCharset(String name){
        try {
            if (name != null && name.length() > 0 && java.nio.charset.Charset.isSupported(name)){
                return java.nio.charset.Charset.forName(name);
            }
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name: use the platform default below.
        }
        return java.nio.charset.Charset.defaultCharset();
    }

    /** Worker that keeps fetching pending URLs until the crawl is complete. */
    class Processer implements Runnable{
        GetWeb gw;

        /**
         * @param g crawler whose queue this worker serves; {@code null} selects the enclosing crawler
         */
        public Processer(GetWeb g){
            this.gw = (g != null) ? g : GetWeb.this;
        }

        public void run(){
            try {
                String next;
                while ((next = gw.takeUrl()) != null){
                    try {
                        gw.getWebByUrl(next,gw.charset,gw.allUrls.get(next)+"");
                    } finally {
                        gw.finishUrl();
                    }
                }
            } catch (InterruptedException e) {
                // Preserve the interrupt for whoever owns this thread and stop working.
                Thread.currentThread().interrupt();
            }
        }
    }
}

运行方法：

使用命令行参数，第一个参数是url，注意用http://开头，不能省略。如下：
D:\tmp>javac GetWeb.java

D:\tmp>java GetWeb http://www.baidu.com 3

运行结果：

Homepage = http://www.baidu.com!
Domain = baidu.com!
Start!

Getting web by url: http://www.baidu.com

Find a new url,depth:2 http://passport.baidu.com/?login&tpl=mn

Find a new url,depth:2 http://news.baidu.com

Find a new url,depth:2 http://tieba.baidu.com

Find a new url,depth:2 http://zhidao.baidu.com

Find a new url,depth:2 http://mp3.baidu.com

Find a new url,depth:2 http://image.baidu.com

Find a new url,depth:2 http://video.baidu.com

Find a new url,depth:2 http://hi.baidu.com

Find a new url,depth:2 http://utility.baidu.com/traf/click.php?id=215&url=http://www.baidu.com

Find a new url,depth:2 http://bar.baidu.com/sobar/prom23.html

Find a new url,depth:2 http://e.baidu.com

Find a new url,depth:2 http://top.baidu.com

Find a new url,depth:2 http://ir.baidu.com

Find a new url,depth:2 http://www.baidu.com/duty/

Find a new url,depth:2 http://hi.baidu.com/baidu/

Get web successfully! http://www.baidu.com

Getting web by url: http://passport.baidu.com/?login&tpl=mn

Getting web by url: http://news.baidu.com

Getting web by url: http://tieba.baidu.com

Getting web by url: http://zhidao.baidu.com

Getting web by url: http://mp3.baidu.com

Getting web by url: http://image.baidu.com

Getting web by url: http://video.baidu.com

Getting web by url: http://utility.baidu.com/traf/click.php?id=215&url=http://www.baidu.com

Find a new url,depth:3 http://news.baidu.com/

Find a new url,depth:3 http://www.baidu.com/

Find a new url,depth:3 http://tieba.baidu.com/

Find a new url,depth:3 http://hi.baidu.com/

Find a new url,depth:3 http://tieba.baidu.com/

Find a new url,depth:3 http://zhidao.baidu.com/

Find a new url,depth:3 http://mp3.baidu.com/

Getting web by url: http://hi.baidu.com

Find a new url,depth:3 http://hi.baidu.com/

Find a new url,depth:3 http://image.baidu.com/

Find a new url,depth:3 http://video.baidu.com/

Find a new url,depth:3 https://passport.baidu.com/?getpass

……

超简单的 Web 爬虫程序，不过可以在他基础之上改造一下，写出强大点的爬虫！
谢谢提供程序的 blog 友！
//一直在使用和写PHP spider的爬虫程序。如果用JAVA写的话前台可以加入spider 页面

但是jaxa部分要删去，因PHP不是多线程。下面是网上转载的一个JAVA爬虫小例。很好的学习资料。

/**
* @author Jack.Wang
*
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// 搜索Web爬行者
/**
 * Keyword-search crawler: starting from one URL it follows http links
 * breadth-first, honours each host's robots.txt {@code Disallow:} lines, and
 * collects the URLs of the pages that contain every search term.
 */
public class SearchCrawler implements Runnable {

    /** Upper bound for establishing a connection, so a dead host cannot stall the crawl. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound for a single blocking read on an open connection. */
    private static final int READ_TIMEOUT_MS = 30000;
    /** Captures the target of {@code <a href=...>}; the target ends at a double quote or '>'. */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a\\s+href\\s*=\\s*\"?(.*?)[\">]", Pattern.CASE_INSENSITIVE);
    /** Separator between search terms. */
    private static final Pattern WHITESPACE = Pattern.compile("[\\s]+");

    /*
     * disallowListCache maps a host to the paths its robots.txt forbids.
     * The robots protocol places a robots.txt file in the site root that lists
     * the areas a crawler must skip, for example:
     *
     *   # robots.txt for http://somehost.com/
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration # Disallow robots on registration page
     *   Disallow: /login
     */
    private HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>(); // error messages
    ArrayList<String> result = new ArrayList<String>();    // URLs of matching pages
    String startUrl;                 // URL the search starts from
    int maxUrl;                      // maximum number of URLs to process
    String searchString;             // whitespace-separated search terms
    boolean caseSensitive = false;   // whether matching is case sensitive
    boolean limitHost = false;       // whether to stay on the start page's host

    /**
     * @param startUrl     URL the search starts from
     * @param maxUrl       maximum number of URLs to process
     * @param searchString whitespace-separated terms that must all occur in a page
     */
    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    /** @return the URLs of the matching pages found so far */
    public ArrayList<String> getResult() {
        return result;
    }

    /** Runs the search with the settings stored in this instance. */
    public void run() {
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }

    /**
     * Parses a URL string, accepting only http URLs.
     *
     * @return the parsed URL, or {@code null} if it is not a well-formed http URL
     */
    private URL verifyUrl(String url) {
        if (url == null || !url.toLowerCase().startsWith("http://")) {
            return null;
        }
        try {
            return new URL(url);
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Checks the host's robots.txt for a {@code Disallow:} prefix matching the URL.
     * The list is downloaded once per host; a host whose robots.txt cannot be
     * read is treated as allowing everything.
     */
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();

        ArrayList<String> disallowList = disallowListCache.get(host);
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            BufferedReader reader = null;
            try {
                reader = openReader(new URL("http://" + host + "/robots.txt"));
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {
                        String disallowPath = line.substring("Disallow:".length());

                        // Drop a trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath = disallowPath.substring(0, commentIndex);
                        }

                        disallowPath = disallowPath.trim();
                        // An empty "Disallow:" means nothing is forbidden; adding ""
                        // would match every path.
                        if (disallowPath.length() > 0) {
                            disallowList.add(disallowPath);
                        }
                    }
                }
            } catch (Exception e) {
                // No readable robots.txt: allow everything on this host.
                disallowList.clear();
            } finally {
                closeQuietly(reader);
            }
            // Cache the outcome, including failures, so each host is asked only once.
            disallowListCache.put(host, disallowList);
        }

        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            if (file.startsWith(disallowList.get(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads a page as text.
     *
     * @return the page content with lines separated by '\n', or {@code null}
     *         when the page cannot be read
     */
    private String downloadPage(URL pageUrl) {
        BufferedReader reader = null;
        try {
            reader = openReader(pageUrl);
            StringBuilder pageBuffer = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep a separator so words on adjacent lines are not fused together.
                pageBuffer.append(line).append('\n');
            }
            return pageBuffer.toString();
        } catch (Exception e) {
            // Best effort: an unreadable page is simply skipped by the caller.
            return null;
        } finally {
            closeQuietly(reader);
        }
    }

    /** Opens a buffered reader on the URL with connect and read timeouts applied. */
    private BufferedReader openReader(URL url) throws java.io.IOException {
        java.net.URLConnection conn = url.openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        return new BufferedReader(new InputStreamReader(conn.getInputStream()));
    }

    /** Closes the reader, ignoring failures; accepts {@code null}. */
    private void closeQuietly(BufferedReader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }

    /** Removes a leading "www." from the URL's host part. */
    private String removeWwwFromUrl(String url) {
        if (url == null) {
            return null;
        }
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) + url.substring(index + 7);
        }
        return url;
    }

    /**
     * Extracts the links of a page that are still worth crawling.
     *
     * @param pageUrl      URL of the page, used to resolve relative links
     * @param pageContents page source
     * @param crawledList  URLs already processed
     * @param limitHost    when true, only links on the page's own host are kept
     * @return the absolute http URLs found, in document order
     */
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        Matcher m = LINK_PATTERN.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) {
                continue;
            }
            // Skip in-page anchors.
            if (link.charAt(0) == '#') {
                continue;
            }
            if (link.indexOf("mailto:") != -1) {
                continue;
            }
            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            if (link.indexOf("://") == -1) {
                // Resolve against the page URL. Building the string by hand with
                // getPort() yields "host:-1" whenever the page URL has no explicit port.
                try {
                    link = new URL(pageUrl, link).toString();
                } catch (java.net.MalformedURLException e) {
                    continue;
                }
            }

            // Drop the fragment.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            // When restricted to one host, drop links that leave it.
            if (limitHost
                    && !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }

            // Skip links that were already processed.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }

        return linkList;
    }

    /**
     * Tests whether the page contains every whitespace-separated term of the
     * search string.
     */
    private boolean searchStringMatches(String pageContents,
            String searchString, boolean caseSensitive) {
        String searchContents = caseSensitive ? pageContents : pageContents.toLowerCase();

        String[] terms = WHITESPACE.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            String term = caseSensitive ? terms[i] : terms[i].toLowerCase();
            if (searchContents.indexOf(term) == -1) {
                return false;
            }
        }
        return true;
    }

    /**
     * Performs the search.
     *
     * @param startUrl      URL the search starts from
     * @param maxUrls       maximum number of URLs to process; must be at least 1
     * @param searchString  whitespace-separated terms that must all occur in a page
     * @param limithost     when true, only links on the same host as the linking page are followed
     * @param caseSensitive whether term matching is case sensitive
     * @return the URLs of the matching pages, or the error list when the arguments are invalid
     */
    public ArrayList<String> crawl(String startUrl, int maxUrls,
            String searchString, boolean limithost, boolean caseSensitive) {

        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString == null || searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing search String");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Normalise the start URL the same way discovered links are normalised.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1 && crawledList.size() == maxUrls) {
                break;
            }

            // Take the oldest pending URL.
            String url = toCrawlList.iterator().next();
            toCrawlList.remove(url);

            // Skip anything that is not a well-formed http URL.
            URL verifiedUrl = verifyUrl(url);
            if (verifiedUrl == null) {
                continue;
            }

            // Skip URLs the host's robots.txt forbids.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                ArrayList<String> links = retrieveLinks(verifiedUrl,
                        pageContents, crawledList, limithost);
                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString,
                        caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }
        return result;
    }

    /** Demo entry point: searches a fixed blog for "jack" on a background thread. */
    public static void main(String[] args) {
        SearchCrawler crawler = new SearchCrawler("http://www.blogjava.net/Jack2007/", 20,"jack");
        Thread search = new Thread(crawler);
        System.out.println("Start searching...");
        System.out.println("result:");
        search.start();
        try {
            search.join();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of discarding it.
            Thread.currentThread().interrupt();
        }
    }
}

用了一个PHP采集器之后，我也想写一个JAVA采集器

其实这也并不难，但也不是人们想的那样简单，当然万事只要有心人。

我用过两种方法来做线程调度：一种是用Thread线程，一种是用Timer工作计时器

第一种Thread：

package com.lch.test;

/**
 * Scheduling demo: counts from 0 to 239 on a worker thread and, at every
 * multiple of five after zero, hands control to {@code Tbaidu.mm()} and then
 * pauses for four seconds before continuing.
 *
 * @author leafage (QQ group: 75386881)
 */
public class Tthread implements Runnable{

    public static void main(String[] args) {
        Tthread tt=new Tthread();
        Thread th=new Thread(tt);
        th.start();     // start the worker thread
    }

    public void run() {
        Tbaidu tb=new Tbaidu();
        for (int j = 0; j < 240; j++) {

            if(j%5==0&&j!=0){ // every fifth step: run the other task
                tb.mm();
                try {
                    Thread.sleep(4000); // pause four seconds before resuming this task
                } catch (InterruptedException e) {
                    // Keep the interrupt visible and stop instead of carrying on regardless.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
            System.out.println("我自己的方法="+j);
        }
    }

}

//+++++++++++===========================其实上面的可写成如下也许会看得更明白

package com.lch.test;

/**
 * Self-contained version of the scheduling demo: {@link #run()} counts from
 * 0 to 239 and, at every multiple of five after zero, switches to
 * {@link #mm()}, which resumes its own count where it last stopped.
 *
 * @author leafage (QQ group: 75386881)
 */
public class ThreadTestWork implements Runnable{

    /** Position at which {@link #mm()} resumes; shared by every call. */
    private static int KNUM;

    public static void main(String[] args) {
        ThreadTestWork tt=new ThreadTestWork();
        Thread th=new Thread(tt);
        th.start();     // start the worker thread
    }

    public void run() {
        for (int j = 0; j < 240; j++) {
            if(j%5==0&&j!=0){ // every fifth step: run the other task
                mm();
                try {
                    Thread.sleep(4000); // pause four seconds before resuming this task
                } catch (InterruptedException e) {
                    // Keep the interrupt visible and stop instead of carrying on regardless.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
            System.out.println("我自己的方法="+j);
        }
    }

    /**
     * Second task: prints its counter, one value per second, up to 33. It returns
     * after printing a non-zero multiple of five, so the caller's task can run;
     * the next call continues from the following value.
     *
     * <p>If the thread is interrupted while sleeping, the interrupt status is
     * restored and the method returns immediately.
     */
    public static void mm(){
        for (int j=KNUM; j < 34; j++) {
            KNUM++;

            System.out.println("Tbaidu的方法="+j+" -- ");
            if(j%5==0&&j!=0){
                return; // yield back to the caller; this effectively pauses the task
            }
            try {
                Thread.sleep(1000); // one second between outputs
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

}
输出结果是在两个方法间在指定时间内切换任务。

我自己的方法=0
我自己的方法=1
我自己的方法=2
我自己的方法=3
我自己的方法=4
Tbaidu的方法=0 --
Tbaidu的方法=1 --
Tbaidu的方法=2 --
Tbaidu的方法=3 --
Tbaidu的方法=4 --
Tbaidu的方法=5 --
我自己的方法=5
我自己的方法=6
我自己的方法=7
我自己的方法=8
我自己的方法=9
Tbaidu的方法=6 --

略

///=============================

package com.lch.test;

// Abbreviated repeat of the Tthread listing: only the entry point is shown.
// NOTE(review): the class implements Runnable but no run() method appears in
// this listing, so it does not compile as shown; presumably the article
// elided run() here — confirm against the full listing.
public class Tthread implements Runnable{

/**
* Creates a Tthread and runs it on a new thread.
*
* @author leafage (QQ group: 75386881)
*/
public static void main(String[] args) {
   Tthread tt=new Tthread();
   Thread th=new Thread(tt);
   th.start();     // start the thread
}

}

============================================

第二种是用计时器做的

package com.lch.test;

import java.util.Timer;

/**
 * Timer-based version of the scheduling demo. The task counts from 0 to 239,
 * one step per second, and at every multiple of five after zero starts an
 * additional Timer that runs a {@code Tbaidu} task every two seconds.
 */
public class Times extends java.util.TimerTask{

    /** Timer that drives this task. */
    private Timer timer = null;

    /** The single running instance, created by {@link #main(String[])}. */
    private static Times times = null;

    /** Every secondary timer started by {@link #run()}, kept so that {@link #stop()} can cancel them. */
    private final java.util.List<Timer> workerTimers = new java.util.ArrayList<Timer>();

    /** Set by {@link #stop()} so a task that is already running ends its loop. */
    private volatile boolean stopped = false;

    public static void main(String[] args) {
        if (times == null) {
            times = new Times();
            times.start();
        }
    }

    protected Times() {
        timer = new Timer(false);
    }

    /** Schedules this task at a fixed rate of once every two seconds, starting immediately. */
    protected void start() {
        timer.scheduleAtFixedRate(this, 0,2*1000);
    }

    /**
     * Stops the main timer and every secondary timer it has started, and asks
     * a task that is already running to finish.
     */
    public static void stop() {
        if (times != null) {
            times.stopped = true;
            times.timer.cancel();
            synchronized (times.workerTimers) {
                for (Timer worker : times.workerTimers) {
                    worker.cancel();
                }
                times.workerTimers.clear();
            }
        }
    }

    @Override
    public void run() {
        for (int j = 0; j < 240; j++) {
            if (stopped) {
                return;
            }
            try {
                Thread.sleep(1000);
                System.out.println("m="+j);
            } catch (InterruptedException e) {
                // Keep the interrupt visible and stop instead of carrying on regardless.
                Thread.currentThread().interrupt();
                return;
            }
            if(j%5==0&&j!=0){
                // Start Tbaidu immediately and repeat it every two seconds.
                Timer worker = new Timer(false);
                synchronized (workerTimers) {
                    if (stopped) {
                        // stop() has already cancelled the tracked timers; do not leak a new one.
                        worker.cancel();
                        return;
                    }
                    workerTimers.add(worker);
                }
                worker.scheduleAtFixedRate(new Tbaidu(), 0,2000);
            }
        }
    }
}

***但是第二种方法比较占资源，推荐用Thread去实现。

附加一个网上转载的内容（注：上面的代码不是转载）：java中如何调用CMD命令

java的Runtime.getRuntime().exec(commandStr)可以调用执行cmd指令。

cmd /c dir 是执行完dir命令后关闭命令窗口。

cmd /k dir 是执行完dir命令后不关闭命令窗口。

cmd /c start dir 会打开一个新窗口后执行dir指令，原窗口会关闭。

cmd /k start dir 会打开一个新窗口后执行dir指令，原窗口不会关闭。

可以用cmd /?查看帮助信息。

★CMD命令★
1. gpedit.msc-----组策略
2. sndrec32-------录音机
3. Nslookup-------IP地址侦测器
4. explorer-------打开资源管理器
5. logoff---------注销命令
6. tsshutdn-------60秒倒计时关机命令
7. lusrmgr.msc----本机用户和组
8. services.msc---本地服务设置
9. oobe/msoobe /a----检查XP是否激活
10. notepad--------打开记事本
11. cleanmgr-------垃圾整理
12. net start messenger----开始信使服务
13. compmgmt.msc---计算机管理
14. net stop messenger-----停止信使服务
15. conf-----------启动netmeeting
16. dvdplay--------DVD播放器
17. charmap--------启动字符映射表
18. diskmgmt.msc---磁盘管理实用程序
19. calc-----------启动计算器
20. dfrg.msc-------磁盘碎片整理程序
21. chkdsk.exe-----Chkdsk磁盘检查
22. devmgmt.msc--- 设备管理器
23. regsvr32 /u *.dll----停止dll文件运行
24. drwtsn32------ 系统医生
25. rononce -p ----15秒关机
26. dxdiag---------检查DirectX信息
27. regedt32-------注册表编辑器
28. Msconfig.exe---系统配置实用程序
29. rsop.msc-------组策略结果集
30. mem.exe--------显示内存使用情况
31. regedit.exe----注册表
32. winchat--------XP自带局域网聊天
33. progman--------程序管理器
34. winmsd---------系统信息
35. perfmon.msc----计算机性能监测程序
36. winver---------检查Windows版本
37. sfc /scannow-----扫描错误并复原
38. taskmgr-----任务管理器（2000／xp／2003）
39. winver---------检查Windows版本
40. wmimgmt.msc----打开windows管理体系结构(WMI)
41. wupdmgr--------windows更新程序
42. wscript--------windows脚本宿主设置
43. write----------写字板
44. winmsd---------系统信息
45. wiaacmgr-------扫描仪和照相机向导
46. winchat--------XP自带局域网聊天
47. mem.exe--------显示内存使用情况
48. Msconfig.exe---系统配置实用程序
49. mplayer2-------简易windows media player
50. mspaint--------画图板
51. mstsc----------远程桌面连接
52. mplayer2-------媒体播放机
53. magnify--------放大镜实用程序
54. mmc------------打开控制台
55. mobsync--------同步命令
56. dxdiag---------检查DirectX信息
57. drwtsn32------ 系统医生
58. devmgmt.msc--- 设备管理器
59. dfrg.msc-------磁盘碎片整理程序
60. diskmgmt.msc---磁盘管理实用程序
61. dcomcnfg-------打开系统组件服务
62. ddeshare-------打开DDE共享设置
63. dvdplay--------DVD播放器
64. net stop messenger-----停止信使服务
65. net start messenger----开始信使服务
66. notepad--------打开记事本
67. nslookup-------网络管理的工具向导
68. ntbackup-------系统备份和还原
69. narrator-------屏幕"讲述人"
70. ntmsmgr.msc----移动存储管理器
71. ntmsoprq.msc---移动存储管理员操作请求
72. netstat -an----(TC)命令检查接口
73. syncapp--------创建一个公文包
74. sysedit--------系统配置编辑器
75. sigverif-------文件签名验证程序
76. sndrec32-------录音机
77. shrpubw--------创建共享文件夹
78. secpol.msc-----本地安全策略
79. syskey---------系统加密，一旦加密就不能解开，保护windows xp系统的双重密码
80. services.msc---本地服务设置
81. Sndvol32-------音量控制程序
82. sfc.exe--------系统文件检查器
83. sfc /scannow---windows文件保护
84. tsshutdn-------60秒倒计时关机命令
3. 84. tsshutdn-------60秒倒计时关机命令
85. tourstart------xp简介（安装完成后出现的漫游xp程序）
86. taskmgr--------任务管理器
87. eventvwr-------事件查看器
88. eudcedit-------造字程序
89. explorer-------打开资源管理器
90. packager-------对象包装程序
91. perfmon.msc----计算机性能监测程序
92. progman--------程序管理器
93. regedit.exe----注册表
94. rsop.msc-------组策略结果集
95. regedt32-------注册表编辑器
96. rononce -p ----15秒关机
97. regsvr32 /u *.dll----停止dll文件运行
98. regsvr32 /u zipfldr.dll------取消ZIP支持
99. cmd.exe--------CMD命令提示符
100. chkdsk.exe-----Chkdsk磁盘检查
101. certmgr.msc----证书管理实用程序
102. calc-----------启动计算器
103. charmap--------启动字符映射表
104. cliconfg-------SQL SERVER 客户端网络实用程序
105. Clipbrd--------剪贴板查看器
106. conf-----------启动netmeeting
107. compmgmt.msc---计算机管理
108. cleanmgr-------垃圾整理
109. ciadv.msc------索引服务程序
110. osk------------打开屏幕键盘
111. odbcad32-------ODBC数据源管理器
112. oobe/msoobe /a----检查XP是否激活
113. lusrmgr.msc----本机用户和组
114. logoff---------注销命令
115. iexpress-------木马捆绑工具，系统自带
116. Nslookup-------IP地址侦测器
117. fsmgmt.msc-----共享文件夹管理器
118. utilman--------辅助工具管理器
119. gpedit.msc-----组策略
120. explorer-------打开资源管理器

posted @ 2010-02-18 11:04 博士阅读(3730) 评论(3) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

多线程爬虫

公告