Java在文本处理中的相关辅助工具类

1,Java分词

package com.bobo.util;

import ICTCLAS.I3S.AC.ICTCLAS50;

/**
 * Word-segmentation helper that delegates to the ICTCLAS50 native segmenter.
 */
public class Cutwords {
    /**
     * Segments {@code microblog} with ICTCLAS50 and returns the segmenter's output.
     *
     * <p>A new {@code ICTCLAS50} instance is created and initialised on every call.
     * Text is passed to and read back from the segmenter as GB2312 bytes.
     *
     * <p>NOTE(review): no exit/cleanup call is made on the segmenter after use —
     * confirm whether the library requires one.
     *
     * @param microblog
     *            the text to segment; {@code null} yields an empty string
     * @return the segmented text, or an empty string if the input is {@code null}
     *         or segmentation fails (the failure is reported on {@code System.err})
     */
    public static String Segment(String microblog) {
        if (microblog == null) {
            return "";
        }
        String textSeg = "";
        try {
            ICTCLAS50 testICTCLAS50 = new ICTCLAS50();
            // "." is handed to the initialiser; presumably the directory that holds the
            // ICTCLAS configuration/data files — TODO confirm against the library docs.
            String argu = ".";
            testICTCLAS50.ICTCLAS_Init(argu.getBytes("GB2312"));

            // The two trailing 0 arguments are passed through unchanged from the original
            // code; presumably encoding type and POS-tagging flag — verify.
            byte nativeBytes[] = testICTCLAS50.ICTCLAS_ParagraphProcess(
                    microblog.getBytes("GB2312"), 0, 0);
            textSeg = new String(nativeBytes, 0, nativeBytes.length, "GB2312");
        } catch (Exception ex) {
            // Still best-effort (callers get ""), but no longer silent: a failed
            // initialisation or native call would otherwise be indistinguishable
            // from segmenting an empty text.
            System.err.println("Cutwords.Segment failed: " + ex);
        }
        return textSeg;
    }
}
Cutwords

2,Java文件读写

package com.bobo.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * File-system helpers: recursive directory listing and whole-file reading.
 */
public class FileUtil {

    /**
     * The list produced by the most recent {@link #readDirs(String)} call.
     * Kept for backward compatibility; prefer the value returned by
     * {@code readDirs}. It is replaced (not appended to) on every call.
     */
    public static ArrayList<String> FileList = new ArrayList<String>();

    /**
     * Lists every file in a directory and all of its sub-directories.
     *
     * <p>Each call starts from an empty list, so results of earlier calls are
     * not mixed into the returned list. If {@code filepath} is not a
     * directory, a message is printed to standard output and an empty list is
     * returned. Sub-directories that cannot be listed are skipped.
     *
     * @param filepath
     *            path of the directory to scan
     * @return absolute paths of all non-directory entries under
     *         {@code filepath}; the same list is also stored in
     *         {@link #FileList}
     * @throws FileNotFoundException
     *             declared for backward compatibility with existing callers
     * @throws IOException
     *             declared for backward compatibility with existing callers
     */
    public static List<String> readDirs(String filepath)
            throws FileNotFoundException, IOException {
        ArrayList<String> found = new ArrayList<String>();
        File root = new File(filepath);
        if (!root.isDirectory()) {
            System.out.println("输入的不是目錄名称;");
            System.out.println("filepath:" + root.getAbsolutePath());
        } else {
            collectFiles(filepath, found);
        }
        FileList = found;
        return found;
    }

    /**
     * Recursively appends the absolute path of every non-directory entry
     * below {@code dirPath} to {@code out}.
     */
    private static void collectFiles(String dirPath, List<String> out) {
        String[] names = new File(dirPath).list();
        if (names == null) {
            // File.list() returns null when the path is not a directory or
            // an I/O error occurs (e.g. permission denied): nothing to add.
            return;
        }
        for (int i = 0; i < names.length; i++) {
            String childPath = dirPath + "/" + names[i];
            File child = new File(childPath);
            if (child.isDirectory()) {
                collectFiles(childPath, out);
            } else {
                out.add(child.getAbsolutePath());
            }
        }
    }

    /**
     * Reads a whole file, decoded as GBK, into a string.
     *
     * @param file
     *            name of the file to read
     * @return the file content; every line is terminated with {@code \r\n}
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     * @throws IOException
     *             if reading fails
     */
    public static String readFile(String file) throws FileNotFoundException,
            IOException {
        StringBuilder content = new StringBuilder();
        FileInputStream in = new FileInputStream(file);
        try {
            // Wrap the byte stream in a GBK-decoding, line-oriented reader.
            BufferedReader br = new BufferedReader(new InputStreamReader(in,
                    "gbk"));
            String line = br.readLine();
            while (line != null) {
                content.append(line).append("\r\n");
                line = br.readLine();
            }
        } finally {
            // Closing the underlying stream releases the file handle even if
            // decoding or reading fails part-way.
            in.close();
        }
        return content.toString();
    }
    // Note: BufferedReader is convenient for reading files; PrintStream is
    // convenient for writing a file line by line.

}
FileUtil

 

3,字符串工具类

package com.bobo.util;

import java.util.Stack;
import java.util.regex.Pattern;

/**
 * Null-tolerant string helpers.
 */
public class StringUtil {
    /** One or more ASCII digits; compiled once instead of on every call. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** One or more ASCII letters of either case; compiled once. */
    private static final Pattern LETTERS = Pattern.compile("[a-zA-Z]+");

    /**
     * Finds the right-hand symbol that closes the first left-hand symbol,
     * taking nesting into account.
     *
     * <p>Symbols are compared against single characters of {@code str}, so
     * {@code cLeft} and {@code cRight} are expected to be one-character
     * strings. Right-hand symbols that appear before the first left-hand
     * symbol have nothing to close and are ignored.
     *
     * @param str
     *            the string to search; {@code null} yields -1
     * @param cLeft
     *            the opening symbol
     * @param cRight
     *            the closing symbol
     * @return index of the closing symbol that matches the first opening
     *         symbol, or -1 if there is none
     */
    public static int findRightMatchChar(String str, String cLeft, String cRight) {
        if (str == null) {
            return -1;
        }
        int depth = 0; // number of currently unclosed opening symbols
        boolean seenLeft = false;
        for (int i = 0; i < str.length(); i++) {
            String ch = str.substring(i, i + 1);
            if (ch.equals(cLeft)) {
                depth++;
                seenLeft = true;
            }
            // Guard on depth: a stray closing symbol must not underflow the
            // nesting counter.
            if (ch.equals(cRight) && depth > 0) {
                depth--;
            }
            if (seenLeft && depth == 0) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tests whether a string is {@code null}, empty, or whitespace only.
     *
     * @param str
     *            the string to test
     * @return {@code true} if {@code str} is {@code null} or has no
     *         characters left after trimming
     */
    public static boolean isNullOrEmpty(String str) {
        return str == null || str.trim().length() == 0;
    }

    /**
     * Null-safe equality check.
     *
     * @param str1
     *            first string, may be {@code null}
     * @param str2
     *            second string, may be {@code null}
     * @return {@code true} if both are {@code null} or both hold the same
     *         characters
     */
    public static boolean equals(String str1, String str2) {
        return str1 == str2 || str1 != null && str1.equals(str2);
    }

    /**
     * Case-insensitive equality check.
     *
     * @param str1
     *            first string, may be {@code null}
     * @param str2
     *            second string, may be {@code null}
     * @return {@code true} if {@code str1} is non-null and equals
     *         {@code str2} ignoring case; {@code false} whenever
     *         {@code str1} is {@code null}, including when both arguments
     *         are {@code null}
     */
    public static boolean equalsIgnoreCase(String str1, String str2) {
        return str1 != null && str1.equalsIgnoreCase(str2);
    }

    /**
     * Tests whether {@code str1} contains {@code str2}.
     *
     * @param str1
     *            the string to search in
     * @param str2
     *            the string to look for
     * @return {@code true} if both are non-null and {@code str1} contains
     *         {@code str2}; {@code false} otherwise
     */
    public static boolean contains(String str1, String str2) {
        return str1 != null && str2 != null && str1.contains(str2);
    }

    /**
     * Replaces {@code null} with the empty string.
     *
     * @param str
     *            the string to normalise
     * @return {@code ""} if {@code str} is {@code null}, otherwise
     *         {@code str} itself
     */
    public static String getString(String str) {
        return str == null ? "" : str;
    }

    /**
     * Tests whether the text form of a value consists solely of ASCII digits.
     *
     * @param str
     *            the value to test; its {@code toString()} is examined
     * @return {@code true} if the text is one or more characters
     *         {@code 0-9}; {@code false} for {@code null} or empty text
     */
    public static boolean isNumeric(Object str) {
        return str != null && DIGITS.matcher(str.toString()).matches();
    }

    /**
     * Tests whether the text form of a value consists solely of English
     * letters.
     *
     * @param str
     *            the value to test; its {@code toString()} is examined
     * @return {@code true} if the text is one or more characters
     *         {@code a-z} or {@code A-Z}; {@code false} for {@code null}
     *         or empty text
     */
    public static boolean isEnglish(Object str) {
        return str != null && LETTERS.matcher(str.toString()).matches();
    }
}
StringUtil

 4,在Java中运行shell命令的相关工具类

package com.bobo.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Runs an external command and collects its exit value, standard output and
 * standard error, giving up after a configurable timeout.
 */
public class CommandHelper {
    /**
     * Timeout in milliseconds applied to every {@link #exec(String)} call.
     * Defaults to 10 seconds so that callers who never assign it do not time
     * out immediately; may be reassigned by callers.
     */
    public static int DEFAULT_TIMEOUT = 10000;

    /** Polling interval in milliseconds while waiting for the process. */
    public static final int DEFAULT_INTERVAL = 1000;

    /** Start time (epoch milliseconds) of the wait currently in progress. */
    public static long START;

    /** Demo entry point: runs {@code wc -l *.*} and prints the result. */
    public static void main(String[] args) {
        DEFAULT_TIMEOUT = 10000;
        try {
            System.out.println(exec("wc -l *.*").toString());
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag before reporting.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * Executes {@code command} via {@link Runtime#exec(String)} and waits
     * for it to finish or for {@link #DEFAULT_TIMEOUT} to elapse.
     *
     * @param command
     *            the command line to run
     * @return the collected result; on timeout the exit value is
     *         {@code CommandResult.EXIT_VALUE_TIMEOUT} and the output is
     *         {@code "Command process timeout"}
     * @throws IOException
     *             if the process cannot be started or its output cannot be
     *             read
     * @throws InterruptedException
     *             if the calling thread is interrupted while waiting
     */
    public static CommandResult exec(String command) throws IOException,
            InterruptedException {
        Process process = Runtime.getRuntime().exec(command);
        try {
            return wait(process);
        } finally {
            // Always release the process, including on timeout or when
            // waiting fails with an exception.
            process.destroy();
        }
    }

    /** Returns whether the current wait has reached the timeout. */
    private static boolean isOverTime() {
        return System.currentTimeMillis() - START >= DEFAULT_TIMEOUT;
    }

    /**
     * Polls {@code process} every {@link #DEFAULT_INTERVAL} milliseconds
     * until it exits or the timeout elapses, then gathers its output.
     *
     * <p>NOTE(review): the process streams are read only after the process
     * has exited. A command that writes more than the pipe buffer can hold
     * may therefore block until the timeout — confirm against the
     * java.lang.Process documentation before using with large outputs.
     */
    private static CommandResult wait(Process process)
            throws InterruptedException, IOException {
        BufferedReader errorStreamReader = null;
        BufferedReader inputStreamReader = null;
        try {
            errorStreamReader = new BufferedReader(new InputStreamReader(
                    process.getErrorStream()));
            inputStreamReader = new BufferedReader(new InputStreamReader(
                    process.getInputStream()));

            // timeout control
            START = System.currentTimeMillis();
            boolean isFinished = false;

            for (;;) {
                if (isOverTime()) {
                    CommandResult result = new CommandResult();
                    result.setExitValue(CommandResult.EXIT_VALUE_TIMEOUT);
                    result.setOutput("Command process timeout");
                    return result;
                }

                if (isFinished) {
                    CommandResult result = new CommandResult();
                    result.setExitValue(process.waitFor());

                    // Error/output stay unset (null) when the respective
                    // stream has nothing ready to read.
                    if (errorStreamReader.ready()) {
                        result.setError(readAll(errorStreamReader));
                    }
                    if (inputStreamReader.ready()) {
                        result.setOutput(readAll(inputStreamReader));
                    }
                    return result;
                }

                try {
                    // exitValue() throws while the process is still running.
                    process.exitValue();
                    isFinished = true;
                } catch (IllegalThreadStateException e) {
                    // process hasn't finished yet
                    Thread.sleep(DEFAULT_INTERVAL);
                }
            }

        } finally {
            closeQuietly(errorStreamReader);
            closeQuietly(inputStreamReader);
        }
    }

    /**
     * Reads all remaining lines from {@code reader} and concatenates them
     * without any line separator.
     */
    private static String readAll(BufferedReader reader) throws IOException {
        StringBuilder buffer = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            buffer.append(line);
        }
        return buffer.toString();
    }

    /** Closes {@code reader} if non-null, ignoring any failure to close. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a finished
                // process's stream fails.
            }
        }
    }
}
CommandHelper

 

package com.bobo.util;

/**
 * Outcome of an external command: exit value, standard output and standard
 * error.
 */
public class CommandResult {
    /** Exit value used to signal that the command timed out. */
    public static final int EXIT_VALUE_TIMEOUT = -1;

    private String output;

    int exitValue;

    private String error;

    /**
     * @param output
     *            the output text to set
     */
    void setOutput(String output) {
        this.output = output;
    }

    /**
     * @return the output text, or {@code null} if none was set
     */
    public String getOutput() {
        return output;
    }

    /**
     * @param value
     *            the exit value to set
     */
    void setExitValue(int value) {
        exitValue = value;
    }

    /**
     * Public like the other getters, so that callers outside this package
     * can inspect the exit code.
     *
     * @return the exit value, or {@link #EXIT_VALUE_TIMEOUT} if that was set
     */
    public int getExitValue() {
        return exitValue;
    }

    /**
     * @return the error text, or {@code null} if none was set
     */
    public String getError() {
        return error;
    }

    /**
     * @param error
     *            the error to set
     */
    public void setError(String error) {
        this.error = error;
    }

    /**
     * @return the three fields formatted as
     *         {@code output:<output>;error:<error>;exitValue:<exitValue>}
     */
    @Override
    public String toString() {
        return "output:" + this.output + ";error:" + this.error + ";exitValue:"
                + this.exitValue;
    }
}
CommandResult

 5,过滤某个目录下以特定后缀结尾的文件

package com.bobo.myinterface;

import java.io.File;
import java.io.FileFilter;

/**
 * File filter that lets through every directory plus any file whose absolute
 * path ends with a given suffix.
 */
public class MyFileFilter implements FileFilter {
    private String suffix;

    /**
     * @param suffix
     *            the ending a file's absolute path must have to be accepted
     */
    public MyFileFilter(String suffix) {
        this.suffix = suffix;
    }

    /**
     * @param arg0
     *            the candidate file or directory
     * @return {@code true} for directories and for paths ending with the
     *         configured suffix, {@code false} otherwise
     */
    @Override
    public boolean accept(File arg0) {
        // Directories always pass, regardless of their name.
        if (arg0.isDirectory()) {
            return true;
        }
        String absolutePath = arg0.getAbsolutePath();
        return absolutePath.endsWith(this.suffix);
    }

}
文件过滤器

在FileUtil中添加showAllFiles方法

    /**
     * Recursively collects the files under {@code dir} that pass
     * {@code filter}, printing each file's absolute path as it is found.
     *
     * <p>Only entries accepted by {@code filter} are visited, so a
     * sub-directory is descended into only if the filter accepts it.
     * If {@code dir} cannot be listed, nothing is added.
     *
     * @param dir
     *            the directory to scan
     * @param filter
     *            decides which entries of each directory are considered
     * @param fileList
     *            receives every accepted non-directory entry
     */
    public static void showAllFiles(File dir,FileFilter filter,ArrayList<File> fileList) {
        File[] fs = dir.listFiles(filter);
        if (fs == null) {
            // listFiles returns null when dir is not a directory or an I/O
            // error occurs; there is nothing to collect in that case.
            return;
        }
        for (File entry : fs) {
            if (entry.isDirectory()) {
                showAllFiles(entry, filter, fileList);
            } else {
                System.out.println(entry.getAbsolutePath());
                fileList.add(entry);
            }
        }
    }
showAllFiles方法

最终调用

    // Collect every annotated data file (those ending in ".dealed") under the data directory.
    ArrayList<File> fileList = new ArrayList<File>();
    MyFileFilter dealedFilter = new MyFileFilter(".dealed");
    File dataDir = new File(Constants.DataDir);
    FileUtil.showAllFiles(dataDir, dealedFilter, fileList);
    // Print how many files were found.
    System.out.println(fileList.size());
        
列举特定后缀文件的调用方法

 

posted @ 2014-01-06 10:16  bobo的学习笔记  阅读(473)  评论(0)  编辑  收藏  举报