基于Java的数据采集(三)

《基于Java的数据采集(一)》:http://www.cnblogs.com/lichenwei/p/3904715.html

《基于Java的数据采集(二)》:http://www.cnblogs.com/lichenwei/p/3905370.html

《基于Java的数据采集(终结篇)》:http://www.cnblogs.com/lichenwei/p/3910492.html

基于之前2篇Java数据采集入库,做了下功能整合,实现本地的存读取,上个效果图:

直接上代码吧,本程序只是作为"如何用JAVA抓取页面简单采集入库"的入门,在实际做采集工具的时候,还需考虑许多东西,比如采集某个页面时出现卡顿或延迟该怎么办等一系列问题,希望这篇文章能够抛砖引玉。

先看下项目结构:

一共有五个类:

MySql.java  --数据库操作类

RegEX.java   --正则匹配类

GetAllData.java --采集类

Action.java  --功能实现类

FootBallMain.java --主程序类

其他的,直接结合前面2篇文章外加看代码注释吧

MySql.java

 1 package com.lcw.curl;
 2 
 3 
 4 import java.sql.Connection;
 5 import java.sql.DriverManager;
 6 import java.sql.ResultSet;
 7 import java.sql.SQLException;
 8 import java.sql.Statement;
 9 
10 
/**
 * Small JDBC helper for the football database: one method for data-modifying
 * statements and one for queries.
 *
 * <p>Instances are not safe for concurrent use, because {@link #searchMySql(String)}
 * publishes the connection and statement it opens through the public
 * {@link #conn} and {@link #stmt} fields.
 *
 * @author Balla_兔子
 */
public class MySql {

    // Driver class, JDBC URL and credentials. Left public and mutable so that
    // existing code reading or overriding them keeps working.
    public String driver = "com.mysql.jdbc.Driver";
    public String url = "jdbc:mysql://127.0.0.1:3306/football";
    public String user = "root";
    public String password = "";
    // Statement and connection backing the ResultSet most recently returned by
    // searchMySql(); release them with close().
    public Statement stmt = null;
    public Connection conn = null;

    /**
     * Executes one data-modifying statement (INSERT/UPDATE/DELETE) on a fresh
     * connection and always releases that connection afterwards.
     *
     * <p>A failure of the statement itself is reported on standard error and
     * otherwise ignored, so a caller looping over many rows is not interrupted
     * by one bad row.
     *
     * @param insertSQl the SQL text to execute
     * @throws IllegalStateException if no connection or statement could be
     *         created; the cause is the underlying {@link SQLException}
     */
    public void datatoMySql(String insertSQl) {
        loadDriver();
        Connection connection = null;
        Statement statement = null;
        try {
            try {
                connection = DriverManager.getConnection(url, user, password);
                statement = connection.createStatement();
            } catch (SQLException e) {
                // Without this the method would go on to dereference a null
                // statement and fail with an uninformative NullPointerException.
                throw new IllegalStateException("Unable to connect to " + url, e);
            }
            try {
                statement.executeUpdate(insertSQl);
            } catch (SQLException e) {
                e.printStackTrace();
            }
        } finally {
            closeQuietly(statement);
            closeQuietly(connection);
        }
    }

    /**
     * Runs a query on a fresh connection and returns its live result set.
     *
     * <p>The connection has to stay open while the caller reads the result, so
     * it is stored in {@link #conn} / {@link #stmt}; call {@link #close()} when
     * done. Calling this method again on the same instance replaces those
     * fields without closing the previous connection, because an earlier
     * result set may still be in use.
     *
     * @param selectSQl the SELECT statement to run
     * @return the open result set, or {@code null} if the query itself failed
     *         (the failure is printed and the connection is released)
     * @throws IllegalStateException if no connection or statement could be
     *         created; the cause is the underlying {@link SQLException}
     */
    public ResultSet searchMySql(String selectSQl) {
        loadDriver();
        Connection connection = null;
        Statement statement = null;
        try {
            connection = DriverManager.getConnection(url, user, password);
            statement = connection.createStatement();
        } catch (SQLException e) {
            closeQuietly(statement);
            closeQuietly(connection);
            throw new IllegalStateException("Unable to connect to " + url, e);
        }

        ResultSet rs;
        try {
            // Execute the SELECT statement.
            rs = statement.executeQuery(selectSQl);
        } catch (SQLException e) {
            e.printStackTrace();
            // Nobody will ever read from this connection, so do not leak it.
            closeQuietly(statement);
            closeQuietly(connection);
            return null;
        }

        conn = connection;
        stmt = statement;
        return rs;
    }

    /**
     * Releases the statement and connection opened by the last successful
     * {@link #searchMySql(String)} call. Safe to call repeatedly.
     */
    public void close() {
        closeQuietly(stmt);
        closeQuietly(conn);
        stmt = null;
        conn = null;
    }

    /** Loads the configured driver class so it can register itself with DriverManager. */
    private void loadDriver() {
        try {
            Class.forName(driver);
        } catch (ClassNotFoundException e) {
            // Not fatal here: DriverManager reports the real problem when the
            // connection is requested.
            System.out.println("Unable to find the local driver");
            e.printStackTrace();
        }
    }

    /** Closes a statement, printing (not propagating) any failure. */
    private static void closeQuietly(Statement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes a connection, printing (not propagating) any failure. */
    private static void closeQuietly(Connection connection) {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

}
MySql.java

RegEX.java

 1 package com.lcw.curl;
 2 
 3 import java.util.regex.Matcher;
 4 import java.util.regex.Pattern;
 5 
 6 public class RegEX {
 7 
 8     /**
 9      * 
10      * @param regex
11      * 正则表达式
12      * @param content
13      * 所要匹配的内容
14      * @return
15      */
16     public String getData(String regex, String content) {
17         Pattern pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);// 设定正则表达式,不区分大小写
18         Matcher matcher = pattern.matcher(content);
19         if (matcher.find()) {
20             return matcher.group();//返回正则匹配结果
21         } else {
22             return "";
23         }
24     }
25 
26 }
RegEX.java

GetAllData.java

 1 package com.lcw.curl;
 2 
 3 import java.io.BufferedReader;
 4 import java.io.InputStreamReader;
 5 import java.net.URL;
 6 
 7 public class GetAllData {
 8 
 9     /**采集类
10      * @param Balla_兔子
11      */
12     public void getAllData() {
13 
14         try {
15             String address = "http://www.footballresults.org/league.php?league=EngDiv1";
16             URL url = new URL(address);
17             InputStreamReader inputStreamReader = new InputStreamReader(url
18                     .openStream(), "utf-8");// 打开地址,以UTF-8编码的形式返回字节并转为字符
19             BufferedReader bufferedReader = new BufferedReader(
20                     inputStreamReader);// 从字符输入流中读取文本,缓冲各个字符,从而提供字符、数组和行的高效读取。
21 
22             RegEX data = new RegEX();
23             MySql mySql = new MySql();
24             String content = "";// 用来接受每次读取的行字符
25             int flag = 0;// 标志,队伍信息刚好在日期信息后面,则正则相同,用于分离数据
26             String dateRegex = "\\d{1,2}\\.\\d{1,2}\\.\\d{4}";// 日期匹配正则表达式
27             String teamRegex = ">[^<>]*</a>";// 队伍匹配正则表达式
28             String scoreRegex = ">(\\d{1,2}-\\d{1,2})</TD>";// 比分正则表达式
29             String tempDate = "";// 存储临时比赛时间
30             String teama = "";// 存储临时主队
31             String teamb = "";// 存储临时客队
32             String score = "";// 存储临时比分
33             int i = 0;// 记录信息条数
34             String sql = "";// 数据库语句
35 
36             while ((content = bufferedReader.readLine()) != null) {// 每次读取一行数据
37                 // 获取比赛日期信息
38                 String dateInfo = data.getData(dateRegex, content);
39                 if (!dateInfo.equals("")) {
40                     // System.out.println("日期:" + dateInfo);
41                     tempDate = dateInfo;
42                     flag++;
43                 }
44                 // 获取队伍信息,需先读到日期信息让标志符自增
45                 String teamInfo = data.getData(teamRegex, content);
46                 if (!teamInfo.equals("") && flag == 1) {
47                     teama = teamInfo.substring(1, teamInfo.indexOf("</a>"));
48                     // System.out.println("主队:" + teama);
49                     flag++;
50                 } else if (!teamInfo.equals("") && flag == 2) {
51                     teamb = teamInfo.substring(1, teamInfo.indexOf("</a>"));
52                     // System.out.println("客队:" + teamb);
53                     flag = 0;
54                 }
55                 // 获取比分信息
56                 String scoreInfo = data.getData(scoreRegex, content);
57                 if (!scoreInfo.equals("")) {
58                     score = scoreInfo.substring(1, scoreInfo.indexOf("</TD>"));
59                     // System.out.println("比分:" + score);
60                     // System.out.println();
61                     i++;
62                     sql = "insert into football(`date`,`teama`,`teamb`,`score`) values('"
63                             + tempDate
64                             + "','"
65                             + teama
66                             + "','"
67                             + teamb
68                             + "','"
69                             + score + "')";
70                     mySql.datatoMySql(sql);
71                     System.out.println("存储数据成功:" + i + "条");
72                 }
73 
74             }
75             bufferedReader.close();
76             // System.out.println("一共收集到了" + i + "条信息");
77         } catch (Exception e) {
78             e.printStackTrace();
79         }
80 
81     }
82 
83 }
GetAllData.java

Action.java

  1 package com.lcw.curl;
  2 
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Vector;
  8 
  9 public class Action {
 10 
 11     /**
 12      * 操作一:初始化数据库数据
 13      */
 14     public void initData() {
 15         String sql = "delete from football";
 16         MySql doMySql = new MySql();
 17         try {
 18             doMySql.datatoMySql(sql);
 19             System.out.println("数据初始化完毕!");
 20         } catch (Exception e) {
 21             System.out.println("数据初始化失败!");
 22         }
 23 
 24     }
 25 
 26     /**
 27      * 获取所有队伍信息
 28      * 
 29      * @return
 30      */
 31     public Vector<String> getAllTeam() {
 32         ResultSet rs = null;
 33         Vector<String> vector = new Vector<String>();
 34         String sql = "select teama,teamb from football";
 35         MySql doMySql = new MySql();
 36         rs = doMySql.searchMySql(sql);
 37 
 38         try {
 39             while (rs.next()) {
 40                 try {
 41                     if (!vector.contains(rs.getString("teama"))) {
 42                         vector.add(rs.getString("teama"));
 43                     }
 44                     if (!vector.contains(rs.getString("teamb"))) {
 45                         vector.add(rs.getString("teamb"));
 46                     }
 47                 } catch (SQLException e) {
 48                     e.printStackTrace();
 49                 }
 50             }
 51         } catch (SQLException e) {
 52             e.printStackTrace();
 53         }
 54 
 55         return vector;
 56 
 57     }
 58 
 59     /**
 60      * 获取具体某队的比赛信息
 61      * 
 62      * @param team
 63      * @return
 64      */
 65     public List<String> findTeam(String team) {
 66         List<String> list = new ArrayList<String>();
 67         String sql = "select * from football where teama ='" + team
 68                 + "' or teamb ='" + team + "'";
 69         MySql mysql = new MySql();
 70         ResultSet rs = null;
 71         rs = mysql.searchMySql(sql);
 72         try {
 73             while (rs.next()) {
 74                 list.add(rs.getString("date"));
 75                 list.add(rs.getString("teama"));
 76                 list.add(rs.getString("teamb"));
 77                 list.add(rs.getString("score"));
 78             }
 79         } catch (SQLException e) {
 80             e.printStackTrace();
 81         }
 82         return list;
 83 
 84     }
 85 
 86     public List<String> findGame(String date) {
 87         List<String> list = new ArrayList<String>();
 88         ResultSet rs = null;
 89         String sql = "select * from football where date ='" + date + "'";
 90         MySql mysql = new MySql();
 91         rs = mysql.searchMySql(sql);
 92         try {
 93             while (rs.next()) {
 94                 list.add(rs.getString("date"));
 95                 list.add(rs.getString("teama"));
 96                 list.add(rs.getString("teamb"));
 97                 list.add(rs.getString("score"));
 98             }
 99         } catch (SQLException e) {
100             // TODO Auto-generated catch block
101             e.printStackTrace();
102         }
103         return list;
104     }
105 
106 }
Action.java

FootBallMain.java

 1 package com.lcw.curl;
 2 
 3 import java.util.List;
 4 import java.util.Scanner;
 5 import java.util.Vector;
 6 
 7 public class FootBallMain {
 8 
 9     /**主程序类
10      * @param Balla_兔子
11      */
12     public static void main(String[] args) {
13         GetAllData allData = new GetAllData();
14         Action action = new Action();
15 
16         while (true) {
17             System.out.println("①初始化数据库-请按 (1)");
18             System.out.println("②自动化采集数据-请按(2)");
19             System.out.println("③查询参赛队伍-请按(3)");
20             System.out.println("④查询具体球队比赛结果-请按(4)");
21             System.out.println("⑤查询具体某天的比赛详情-请按(5)");
22             Scanner scanner = new Scanner(System.in);
23             String input = scanner.next();
24             if (input.equals("1")) {
25                 System.out.println();
26                 action.initData();
27                 System.out
28                         .println("-----------------------------------------------------");
29             } else if (input.equals("2")) {
30                 System.out.println("正在采集数据...请稍后");
31                 allData.getAllData();
32                 System.out
33                         .println("-----------------------------------------------------");
34             } else if (input.equals("3")) {
35                 Vector<String> allTeam = action.getAllTeam();
36                 System.out.println("正在获取数据...请稍后");
37                 if (allTeam.size() != 0) {
38                     System.out.println("参赛队伍如下:");
39                     for (int i = 0; i < allTeam.size(); i++) {
40                         System.out.println(allTeam.get(i));
41                     }
42                 }
43                 System.out
44                         .println("-----------------------------------------------------");
45             } else if (input.equals("4")) {
46                 System.out.println("请输入您要查询的队伍名:");
47                 String team = scanner.next();
48                 List<String> list = action.findTeam(team);
49                 System.out.println("比赛日期\t\t\t主队\t\t客队\t\t\t比赛结果");
50                 if (list.size() != 0) {
51                     for (int i = 0; i < list.size(); i++) {
52                         System.out.print(list.get(i) + "\t\t");
53                     }
54                 } else {
55                     System.out.println("暂时没有您所提供队伍的比赛信息,敬请关注...");
56                 }
57                 System.out.println();
58                 System.out
59                         .println("-----------------------------------------------------");
60             } else if (input.equals("5")) {
61                 System.out.println("请输入您要查询日期(格式如下:xx.xx.xxxx):");
62                 String date = scanner.next();
63                 List<String> info = action.findGame(date);
64                 System.out.println("比赛日期\t\t\t主队\t\t客队\t\t\t比赛结果");
65                 if (info.size() != 0) {
66                     for (int i = 0; i < info.size(); i++) {
67                         if (i % 4 == 0 && i != 0) {
68                             System.out.println();
69                         }
70                         System.out.print(info.get(i) + "\t\t");
71                     }
72                 } else {
73                     System.out.println("暂时没有您所提供的比赛信息,敬请关注...");
74                 }
75                 System.out.println();
76                 System.out
77                         .println("------------------------------------------------------------------------");
78             } else {
79                 System.out.println("请输入正确的对应编号..");
80                 System.out
81                         .println("------------------------------------------------------------------------");
82             }
83         }
84     }
85 
86 }
FootBallMain.java

 

posted @ 2014-08-12 12:12  李晨玮  阅读(4472)  评论(0)  编辑  收藏  举报