import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Helpers for loading a two-class text corpus from disk and turning it into
 * per-file word vectors.
 *
 * <p>The corpus root is expected to contain two sub-directories named
 * {@code ham} and {@code spam}. Every method that walks the corpus visits all
 * entries of {@code ham} first and then all entries of {@code spam}, in the
 * order reported by {@link File#listFiles()}.
 */
public class ReadFiles
{
    /** Name of the sub-directory holding the "ham" files. */
    private static final String HAM_DIR = "ham";

    /** Name of the sub-directory holding the "spam" files. */
    private static final String SPAM_DIR = "spam";

    /**
     * Token pattern shared by vocabulary building and matrix building, compiled
     * once instead of once per input line.
     *
     * <p>NOTE(review): the {@code .} in the first alternative is unescaped, so
     * it matches any character (for example {@code "12 34"} is one token). It
     * looks like {@code \\.} was intended, but the literal is kept unchanged
     * because altering it would change the extracted vocabulary - confirm.
     */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("\\d+.\\d+|\\w+|\\$");

    /**
     * Returns the number of entries directly inside the given directory.
     *
     * @param pathName path of the directory to inspect
     * @return the number of entries reported by {@link File#listFiles()}, or
     *         {@code 0} when the path is not a directory or cannot be listed
     */
    public static int GetFileNum(String pathName)
    {
        // listFiles() returns null (not an empty array) for a missing or
        // unreadable directory; treat that as "nothing to count".
        File[] entries = new File(pathName).listFiles();
        return entries == null ? 0 : entries.length;
    }

    /**
     * Returns the paths of all corpus files, ham entries first, then spam.
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *                 sub-directories
     * @return the {@link File#getPath()} of every entry in both sub-directories
     * @throws IOException if either sub-directory is missing or cannot be listed
     */
    public static ArrayList<String> GetFileName(String pathName) throws IOException
    {
        ArrayList<String> fileName = new ArrayList<String>();
        for (File file : listCorpusFiles(pathName))
        {
            fileName.add(file.getPath());
        }
        return fileName;
    }

    /**
     * Returns the distinct lower-cased tokens found in all corpus files.
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *                 sub-directories
     * @return a list holding every distinct token once; the order is the
     *         iteration order of the underlying {@link HashSet}
     * @throws IOException if a sub-directory cannot be listed or a file cannot
     *                     be read
     */
    public static ArrayList<String> GetWordsList(String pathName) throws IOException
    {
        HashSet<String> vocabulary = new HashSet<String>();
        for (File file : listCorpusFiles(pathName))
        {
            vocabulary.addAll(readTokens(file));
        }
        return new ArrayList<String>(vocabulary);
    }

    /**
     * Builds one {@code MyArray} row per corpus file, ham files first, then
     * spam files.
     *
     * <p>Each row is created with {@code wordList.size()} as its size,
     * initialised with {@code InitArray(0)}, and {@code SetPos(index)} is called
     * once for every token occurrence in the file whose lower-cased text appears
     * in {@code wordList}. Tokens that are not in {@code wordList} are ignored.
     * ({@code MyArray} is declared outside this file; presumably
     * {@code SetPos} marks the word at that index as present - confirm against
     * its definition.)
     *
     * @param pathName corpus root containing the {@code ham} and {@code spam}
     *                 sub-directories
     * @param wordList vocabulary defining the column index of every word
     * @return the rows, in the same file order as {@link #GetFileName(String)}
     *         lists the files
     * @throws IOException if a sub-directory cannot be listed or a file cannot
     *                     be read
     */
    public static ArrayList<MyArray> GetMatrix(String pathName,ArrayList<String> wordList) throws IOException
    {
        // Map each word to the index of its first occurrence so that a lookup is
        // O(1) instead of the O(vocabulary) scan done by List.indexOf, while
        // still returning exactly the index indexOf would.
        Map<String, Integer> wordIndex = new HashMap<String, Integer>();
        for (int i = 0; i < wordList.size(); i++)
        {
            String word = wordList.get(i);
            if (!wordIndex.containsKey(word))
            {
                wordIndex.put(word, Integer.valueOf(i));
            }
        }

        ArrayList<MyArray> trainMatrix = new ArrayList<MyArray>();
        for (File file : listCorpusFiles(pathName))
        {
            MyArray wordArray = new MyArray(wordList.size());
            wordArray.InitArray(0);
            // SetPos is invoked per occurrence (not per distinct token), in
            // reading order, in case MyArray accumulates rather than flags.
            for (String token : readTokens(file))
            {
                Integer pos = wordIndex.get(token);
                if (pos != null)
                {
                    wordArray.SetPos(pos.intValue());
                }
            }
            trainMatrix.add(wordArray);
        }
        return trainMatrix;
    }

    /** Lists every entry of {@code ham} followed by every entry of {@code spam}. */
    private static List<File> listCorpusFiles(String pathName) throws IOException
    {
        // Build child paths with File(parent, child) so the platform separator
        // is used instead of a hard-coded backslash.
        File root = new File(pathName);
        File[] hamFiles = listDirectory(new File(root, HAM_DIR));
        File[] spamFiles = listDirectory(new File(root, SPAM_DIR));

        List<File> files = new ArrayList<File>(hamFiles.length + spamFiles.length);
        for (File file : hamFiles)
        {
            files.add(file);
        }
        for (File file : spamFiles)
        {
            files.add(file);
        }
        return files;
    }

    /** Lists a directory, failing with a clear message instead of returning null. */
    private static File[] listDirectory(File dir) throws IOException
    {
        File[] entries = dir.listFiles();
        if (entries == null)
        {
            throw new FileNotFoundException(
                "Not a directory or cannot be listed: " + dir.getPath());
        }
        return entries;
    }

    /** Reads a file line by line and returns its lower-cased tokens in order of occurrence. */
    private static List<String> readTokens(File file) throws IOException
    {
        List<String> tokens = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() throws.
        // NOTE(review): the reader uses the platform default charset, as the
        // original code did - confirm the corpus encoding.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file))))
        {
            String line;
            while ((line = in.readLine()) != null)
            {
                Matcher matcher = TOKEN_PATTERN.matcher(line);
                while (matcher.find())
                {
                    // Locale.ROOT keeps lower-casing independent of the JVM's
                    // default locale (e.g. Turkish dotless i).
                    tokens.add(matcher.group().toLowerCase(Locale.ROOT));
                }
            }
        }
        return tokens;
    }
}