// ReadFiles

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that read a two-class text corpus from disk.
 *
 * <p>The corpus root is expected to contain two sub-directories, {@code ham}
 * and {@code spam}. Every method that walks the corpus visits the entries of
 * {@code ham} first and the entries of {@code spam} second, each group in
 * sorted path order, so the results of {@link #GetFileName(String)} and
 * {@link #GetMatrix(String, ArrayList)} line up index by index.
 *
 * <p>Files are decoded with the platform default charset.
 */
public class ReadFiles
{
    /** Name of the first sub-directory that is read. */
    private static final String HAM_DIR="ham";

    /** Name of the second sub-directory that is read. */
    private static final String SPAM_DIR="spam";

    /**
     * Token pattern, compiled once instead of once per input line.
     *
     * NOTE(review): the '.' in "\\d+.\\d+" is not escaped, so it matches any
     * character (for example "3 4" is a single token), not only a decimal
     * point. Left untouched so the tokenization stays the same; confirm intent.
     */
    private static final Pattern TOKEN_PATTERN=Pattern.compile("\\d+.\\d+|\\w+|\\$");

    /**
     * Returns the number of entries (files and sub-directories) directly
     * inside the given directory.
     *
     * @param pathName path of the directory to inspect
     * @return the number of entries in the directory
     * @throws IllegalArgumentException if {@code pathName} is not a directory
     *         or cannot be listed
     */
    public static int GetFileNum(String pathName)
    {
        File[] entries=new File(pathName).listFiles();
        if(entries==null)
        {
            // listFiles() reports "not a directory" and I/O errors as null.
            throw new IllegalArgumentException("Not a readable directory: "+pathName);
        }
        return entries.length;
    }

    /**
     * Returns the paths of all corpus entries, {@code ham} first and
     * {@code spam} second.
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *        sub-directories
     * @return the entry paths as produced by {@link File#getPath()}
     * @throws IOException if either sub-directory is missing or cannot be listed
     */
    public static ArrayList<String> GetFileName(String pathName) throws IOException
    {
        ArrayList<File> files=listCorpusFiles(pathName);
        ArrayList<String> fileName=new ArrayList<String>(files.size());
        for(File file:files)
        {
            fileName.add(file.getPath());
        }
        return fileName;
    }

    /**
     * Returns the distinct lower-cased tokens found in all corpus files.
     *
     * <p>The tokens are collected in a {@link HashSet}, so the order of the
     * returned list is not specified.
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *        sub-directories
     * @return a list holding every distinct token exactly once
     * @throws IOException if a sub-directory cannot be listed or a file cannot
     *         be read
     */
    public static ArrayList<String> GetWordsList(String pathName) throws IOException
    {
        HashSet<String> set=new HashSet<String>();
        for(File file:listCorpusFiles(pathName))
        {
            set.addAll(readTokens(file));
        }
        return new ArrayList<String>(set);
    }

    /**
     * Builds one {@code MyArray} per corpus file, {@code ham} files first and
     * {@code spam} files second.
     *
     * <p>Each array is created with {@code wordList.size()} as its constructor
     * argument and initialised with {@code InitArray(0)}. For every token
     * occurrence in the file that is present in {@code wordList},
     * {@code SetPos} is called with the index of the token's first occurrence
     * in {@code wordList}. Tokens that are not in {@code wordList} are ignored.
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *        sub-directories
     * @param wordList the vocabulary; must not be null
     * @return one array per file, in the same order as
     *         {@link #GetFileName(String)}
     * @throws IOException if a sub-directory cannot be listed or a file cannot
     *         be read
     */
    public static ArrayList<MyArray> GetMatrix(String pathName,ArrayList<String> wordList) throws IOException
    {
        ArrayList<File> files=listCorpusFiles(pathName);

        // Index the vocabulary once; a linear indexOf() per token made the
        // method quadratic. Keeping the first occurrence matches indexOf().
        Map<String,Integer> wordIndex=new HashMap<String,Integer>();
        for(int i=0;i<wordList.size();i++)
        {
            String word=wordList.get(i);
            if(!wordIndex.containsKey(word))
            {
                wordIndex.put(word,Integer.valueOf(i));
            }
        }

        ArrayList<MyArray> trainMatrix=new ArrayList<MyArray>(files.size());
        for(File file:files)
        {
            ArrayList<String> tokens=readTokens(file);

            MyArray wordArray=new MyArray(wordList.size());
            wordArray.InitArray(0);
            for(String token:tokens)
            {
                Integer pos=wordIndex.get(token);
                if(pos!=null)
                {
                    wordArray.SetPos(pos.intValue());
                }
            }
            trainMatrix.add(wordArray);
        }
        return trainMatrix;
    }

    /** Lists the entries of {@code ham} followed by the entries of {@code spam} under the given root. */
    private static ArrayList<File> listCorpusFiles(String pathName) throws FileNotFoundException
    {
        if(pathName==null)
        {
            // new File(null, child) would silently resolve against the
            // working directory, so reject null explicitly.
            throw new NullPointerException("pathName");
        }
        // File(parent, child) inserts the platform separator; the previous
        // hard-coded backslash only worked on Windows.
        File[] hamFiles=listSorted(new File(pathName,HAM_DIR));
        File[] spamFiles=listSorted(new File(pathName,SPAM_DIR));

        ArrayList<File> files=new ArrayList<File>(hamFiles.length+spamFiles.length);
        files.addAll(Arrays.asList(hamFiles));
        files.addAll(Arrays.asList(spamFiles));
        return files;
    }

    /** Lists a directory in sorted path order, failing with a clear message if it cannot be listed. */
    private static File[] listSorted(File dir) throws FileNotFoundException
    {
        File[] entries=dir.listFiles();
        if(entries==null)
        {
            throw new FileNotFoundException("Not a readable directory: "+dir.getPath());
        }
        // listFiles() gives no ordering guarantee; sorting keeps separate
        // listings of the same directory aligned.
        Arrays.sort(entries);
        return entries;
    }

    /** Reads a file line by line and returns its lower-cased tokens in order of appearance. */
    private static ArrayList<String> readTokens(File file) throws IOException
    {
        ArrayList<String> tokens=new ArrayList<String>();
        BufferedReader in=new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        try
        {
            String line;
            while((line=in.readLine())!=null)
            {
                Matcher matcher=TOKEN_PATTERN.matcher(line);
                while(matcher.find())
                {
                    // Locale.ROOT keeps lower-casing independent of the
                    // default locale (e.g. Turkish dotless i).
                    tokens.add(matcher.group().toLowerCase(Locale.ROOT));
                }
            }
        }
        finally
        {
            // Close even when reading fails, so the file handle is not leaked.
            in.close();
        }
        return tokens;
    }
}

 

// posted @ 2014-09-19 21:29  爱吃萝卜干  阅读(305)  评论(0)  编辑  收藏  举报