自己写的一个java采集有关的简单类

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches an HTML page and extracts the text of table rows whose first cell
 * starts with a given keyword.
 *
 * <p>Extraction is regex based: rows of the form
 * {@code <tr><td>KEYWORD...</td>...</tr>} are located, their {@code <td>} cells
 * are collected, the cell tags are turned into comma separators, anchor
 * elements are removed and the remaining text is split on commas.
 */
public class ReadHtml {

    /** Address fetched when the caller passes a null or empty URL. */
    private static final String DEFAULT_URL =
            "http://whois.asia/cgi-bin/whois.cgi?whois_query_field=today";

    /** Keyword used when the caller passes a null or empty keyword. */
    private static final String DEFAULT_KEYWORD = "ced";

    /** Matches a complete table cell, including its content. */
    private static final Pattern TD_CELL =
            Pattern.compile("<td[^>]*>[\\s\\S]*?<\\/td>", Pattern.CASE_INSENSITIVE);

    /** Matches an opening cell tag, with or without attributes. */
    private static final Pattern TD_OPEN =
            Pattern.compile("<td[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Matches a closing cell tag. */
    private static final Pattern TD_CLOSE =
            Pattern.compile("</td>", Pattern.CASE_INSENSITIVE);

    /** Matches an anchor element together with its content. */
    private static final Pattern ANCHOR =
            Pattern.compile("<a[^>]*>[\\s\\S]*?<\\/a>", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the page at {@code urlPath} and returns its source as one string.
     *
     * <p>Each line read is followed by a single space instead of its line
     * terminator. The stream is decoded with the platform default charset.
     * Failures are reported on standard error and are not propagated; whatever
     * was read before the failure (possibly nothing) is returned.
     *
     * @param urlPath page address, e.g.
     *                {@code http://whois.asia/cgi-bin/whois.cgi?whois_query_field=today};
     *                a null or empty value selects that default address
     * @return the page source, or an empty string when nothing could be read
     */
    public static String GetCEDSource(String urlPath)
    {
        if (isNullOrEmpty(urlPath)) {
            urlPath = DEFAULT_URL;
        }
        StringBuilder document = new StringBuilder();
        BufferedReader reader = null;
        try {
            URLConnection conn = new URL(urlPath).openConnection();
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String line;
            while ((line = reader.readLine()) != null) {
                document.append(line).append(' ');
            }
        } catch (Exception e) {
            // Deliberately best-effort: callers receive the partial document.
            e.printStackTrace();
        } finally {
            // Close the reader even when reading failed half-way.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        return document.toString();
    }

    /**
     * Extracts the rows starting with the default keyword {@code "ced"}.
     *
     * @param url page address; a null or empty value selects the default address
     * @return the cell texts of the matching rows, or null when nothing matched
     */
    public static String[] GetCEDContent(String url){
        return GetCEDContent(url, "");
    }

    /**
     * Extracts the rows starting with the default keyword from the default page.
     *
     * @return the cell texts of the matching rows, or null when nothing matched
     */
    public static String[] GetCEDContent(){
        return GetCEDContent("", "");
    }

    /**
     * Extracts the cell texts of every table row whose first cell starts with
     * {@code keyword}.
     *
     * @param url     page address; a null or empty value selects the default address
     * @param keyword text the first cell must start with; it is inserted into a
     *                regular expression unescaped and matched case-insensitively;
     *                a null or empty value selects {@code "ced"}
     * @return the cell texts of the matching rows, or null when nothing matched
     */
    public static String[] GetCEDContent(String url, String keyword)
    {
        return extractCells(url, keyword);
    }

    /**
     * Returns one block of the page as a string array. Behaves exactly like
     * {@link #GetCEDContent(String, String)}.
     *
     * @param url     page address; a null or empty value selects the default address
     * @param keyword text the first cell must start with; it is inserted into a
     *                regular expression unescaped and matched case-insensitively;
     *                a null or empty value selects {@code "ced"}
     * @return the cell texts of the matching rows, or null when nothing matched
     */
    public static String[] GetHtmlContentByGroup(String url,String keyword)
    {
        return extractCells(url, keyword);
    }

    /**
     * Splits a comma separated string.
     *
     * @param str string to split
     * @return the parts of {@code str}, or null when {@code str} is null or empty
     */
    public static String[] SplitStr(String str)
    {
        if (isNullOrEmpty(str)) {
            return null;
        }
        return str.split(",");
    }

    /**
     * Replaces every opening and closing {@code td} tag with a comma. Tags are
     * recognised regardless of case and of attributes on the opening tag.
     *
     * @param str markup made of {@code <td>...</td>} cells; must not be null
     * @return {@code str} with each cell tag replaced by {@code ","}
     * @throws NullPointerException if {@code str} is null
     */
    public static String ReplaceStr(String str)
    {
        String withoutOpeningTags = TD_OPEN.matcher(str).replaceAll(",");
        return TD_CLOSE.matcher(withoutOpeningTags).replaceAll(",");
    }

    /**
     * Manual smoke test: prints the rows extracted from the default page.
     *
     * @param s unused
     * @throws IOException never thrown by the current implementation; kept for
     *                     signature compatibility
     */
    public static void main(String[] s) throws IOException {
        String[] test = GetCEDContent();
        if (test == null) {
            // Nothing matched or the page could not be fetched.
            return;
        }
        for (String html : test)
        {
            System.out.println(html);
        }
    }

    /** Shared implementation of the public extraction methods. */
    private static String[] extractCells(String url, String keyword)
    {
        String html = GetCEDSource(url);
        if (isNullOrEmpty(keyword)) {
            keyword = DEFAULT_KEYWORD;
        }
        // A row qualifies when its first cell starts with the keyword.
        Pattern rowPattern = Pattern.compile(
                "<tr[^>]*>[\\s]*<td[^>]*>[\\s]*(" + keyword + "){1}[\\s\\S]*?<\\/td>[\\s]*<\\/tr>",
                Pattern.CASE_INSENSITIVE);
        String rows = joinMatches(rowPattern, html);
        String cells = joinMatches(TD_CELL, rows);

        String content = ReplaceStr(cells);
        // The cell run starts and ends with a separator; drop both. Without any
        // match the content is empty and there is nothing to trim.
        if (content.length() >= 2) {
            content = content.substring(1, content.length() - 1);
        } else {
            content = "";
        }

        // Remove anchors unconditionally so cells without links keep their text.
        String withoutAnchors = ANCHOR.matcher(content).replaceAll("");
        return SplitStr(withoutAnchors);
    }

    /** Concatenates every match of {@code pattern} found in {@code input}. */
    private static String joinMatches(Pattern pattern, String input)
    {
        StringBuilder joined = new StringBuilder();
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            joined.append(matcher.group());
        }
        return joined.toString();
    }

    /** Tells whether {@code value} is null or has no characters. */
    private static boolean isNullOrEmpty(String value)
    {
        return value == null || value.length() == 0;
    }
}

posted on 2009-08-28 18:00 ruonanxiao 阅读(710) 评论(1) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

ruonanxiao

自己写的一个java采集有关的简单类

导航

公告