简单的java采集程序二
继【简单的java采集程序】,这里将完成对整个网站的号码段的采集任务。
【使用预编译+批处理采集网页内容至数据库表中】
在之前我们用statement类来创建sql语句的执行对象,以实现插入字段到数据库的操作,但由于插入的数据量较大,如果继续用statement操作话,会很耗时间,我们用其子类PreparedStatement来进行操作。
PreparedStatement 可以实现sql语句的预编译,我们只需要通过其setString()方法传参即可,这样不仅效率提高了,而且也会更加安全,可防止SQL注入。推荐相关文章
另外我们还可以调用其addBatch()方法 和 executeBatch()实现批量插入处理。
代码如下,喜欢把数据库链接作为一个单独的类
import java.sql.DriverManager; import java.sql.SQLException; import com.mysql.jdbc.Connection; public class database { public static String driver ="com.mysql.jdbc.Driver"; public static String url="jdbc:mysql://127.0.0.1:3306/tele_dat?autoReconnect=true&characterEncoding=UTF-8"; public static String user ="root"; public static String password = "123456"; public static java.sql.Connection conn = null; //返回一个数据库连接对象 public static Connection ConnectToDataBase(){ try { Class.forName(driver); } catch (ClassNotFoundException e) { System.out.println("加载驱动失败"); e.printStackTrace(); } try { conn = DriverManager.getConnection(url, user, password); System.out.println("连接成功"); } catch (SQLException e) { System.out.println("连接出问题了"); e.printStackTrace(); } return (Connection) conn; } //测试连接数据库 public static void main(String args[]) { database.ConnectToDataBase(); } }
主体程序
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.sql.Connection; import java.sql.PreparedStatement; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Crawl { private static Connection conn = database.ConnectToDataBase(); static String home_url = "http://www.hiphop8.com"; //网站首页 static String pattern_pro_city = "<DIV class=title><SPAN>(.*?) - (.*?)<\\/SPAN><\\/DIV>"; //匹配省名,市名 static String pattern_number = ">(13\\d{5}|15\\d{5}|18\\d{5}|147\\d{4})<"; //匹配号码段 static String pattern_pro ="\\w{3}\\.\\w{7}\\.\\w{3}\\/\\w{4}\\/\\w+"; //省份URL static String pattern_city_hz="<LI><A href=\"(.*?)\" target=_blank>"; //城市URL的后缀 //编译预处理相关选项 static String insertSQL = "insert ignore into number_segment(segment,province,city) values(?, ?, ?)"; static PreparedStatement pst = null; static int num_pro = 0; static int num_city=0; static int all_num_tele = 0; public static void main(String[] args) throws Exception { String PreStat = "insert ignore into number_segment(segment,province,city) values (?,?,?) "; pst = conn.prepareStatement(PreStat.toString()); Matcher mat_home = get(home_url,pattern_pro); long start = System.currentTimeMillis(); while(mat_home.find()) { num_pro++; System.out.println("------第"+num_pro+"个省-----"); String city_url_qz = "http://"+mat_home.group()+"/"; int len = city_url_qz.length(); //这里换成StringBuffer来最字符串进行相加处理 StringBuffer city_ur = new StringBuffer(); city_ur.append(city_url_qz); Matcher mat_city_hz = get(city_url_qz,pattern_city_hz); while(mat_city_hz.find()) //通过拼接获得 城市的完整url { num_city++; System.out.println("第"+num_city+"个市"); String last_city_url=city_ur.append(mat_city_hz.group(1)).toString(); //String last_city_url = city_url_qz + mat_city_hz.group(1); int len2 = last_city_url.length(); One_City_Tele_to_DB(last_city_url); city_ur.delete(len,len2); } } long end = System.currentTimeMillis(); long time = (end-start)/(1000*60); conn.close(); System.out.println("查询到的电话号码段总数量:"+all_num_tele); System.out.println("花费的时间是:"+time); } public static void One_City_Tele_to_DB(String url) throws Exception { int this_city_num=0; String pro = null; String city = null; Matcher mat_pro_city = get(url,pattern_pro_city); //获取省名字 市名字 while(mat_pro_city.find()) { String long_pro = mat_pro_city.group(1); pro = long_pro.substring(0, long_pro.length()-1); String long_city = mat_pro_city.group(2); city = long_city.substring(0, long_city.length()-10); System.out.println("省份:"+pro+" "+"城市:"+city+" 正在插入号码段进数据库"); } Matcher mat_number = get(url,pattern_number); //获取号码段 while(mat_number.find()) { pst.setString(1,mat_number.group(1)); pst.setString(2, pro); pst.setString(3, city); pst.addBatch(); this_city_num++; all_num_tele++; } pst.executeBatch(); //每次批量插入一个城市的号码段 pst.clearBatch(); System.out.println("该市插入的号码段的数量是:"+ this_city_num); } //正则匹配 public static Matcher get(String str_url, String pattern) throws Exception { String urlsource = get_Html(str_url); Pattern p = Pattern.compile(pattern); Matcher m = p.matcher(urlsource); return m; } //获取网页内容 public static String get_Html(String str_url) throws IOException{ URL url = new URL(str_url); String content=""; StringBuffer page = new StringBuffer(); try { BufferedReader in = new BufferedReader(new InputStreamReader(url .openStream())); while((content = in.readLine()) != null){ page.append(content); } } catch (IOException e) { e.printStackTrace(); } return page.toString(); } }
实际运行程序,发现有500多个重复的号码段,因为襄樊市 改成 襄阳市,这两个市的号码段全部一样,而数据库表中是以segment(号码)作为主键,所以要设置,当插入有相同主键的sql语句时,自动忽略跳过,方法就是在insert 后面加上ignore就可以了。
另外设置id为auto_increment,但如果把数据表里的数据清空之后,id不会从1重新开始,这时只要在mysql命令行下输入 truncate table table_name 就可以实现id从1开始了。
显然,6分钟的速度还是太慢了,后面试了几次都是在6~8分钟之内(不过相对于不用批处理而直接用statement已经快很多了)。因此还得想办法优化。
【使用智能批量处理+StringBuilder的append()方法】
在插入mysql数据库表中时,可以以insert ignore into number_segment(segment,province,city) values(?, ?, ?),vaulues(?),values(?,?,?)…的形式进行批量插入,上面使用setString()方法传参还是太慢了,直接用StringBuilder的append()方法吧,我们都知道appedn()方法进行字符串相加处理的效率是很高的,所以我们每次在insert语句后面加入一段“vaulues(?,?,?),”这样形式的字符串,然后批处理插入,这里通过变量all_tele_num进行批量处理的的控制,我们规定每次批量执行2000个数据段插入。
直接看代码吧
import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.sql.Connection; import java.sql.PreparedStatement; import java.util.regex.Matcher; import java.util.regex.Pattern; public class SecondCrawl { private static Connection conn = database.ConnectToDataBase(); //预编译 + StringBuilder static StringBuilder PreStat = new StringBuilder(); static String Qz = "insert ignore into number_segment(segment,province,city) values"; static String insertSQL = "insert ignore into number_segment(segment,province,city) values(?, ?, ?)"; static int len1 = Qz.length(); static PreparedStatement pst = null; static String home_url = "http://www.hiphop8.com"; static String pattern_pro_city = "<DIV class=title><SPAN>(.*?) - (.*?)<\\/SPAN><\\/DIV>"; //匹配省名,市名 static String pattern_number = ">(13\\d{5}|15\\d{5}|18\\d{5}|147\\d{4})<"; //匹配号码段 static String pattern_pro ="\\w{3}\\.\\w{7}\\.\\w{3}\\/\\w{4}\\/\\w+"; //省份URL static String pattern_city_hz="<LI><A href=\"(.*?)\" target=_blank>"; //城市URL的后缀 static int num_pro = 0; static int num_city=0; static int all_num_tele=0; public static void main(String[] args) throws Exception { Matcher mat_home = get(home_url,pattern_pro); conn.setAutoCommit(true); PreStat.append(Qz); pst = conn.prepareStatement(insertSQL); //预编译 long start = System.currentTimeMillis(); while(mat_home.find()) { num_pro++; System.out.println("------第"+num_pro+"个省-----"); String city_url_qz = "http://"+mat_home.group()+"/"; int len = city_url_qz.length(); StringBuffer city_ur = new StringBuffer(); city_ur.append(city_url_qz); Matcher mat_city_hz = get(city_url_qz,pattern_city_hz); while(mat_city_hz.find()) //获得城市的url { num_city++; System.out.println("第"+num_city+"个市"); String city_url=city_ur.append(mat_city_hz.group(1)).toString(); int len2 = city_url.length(); One_City_Tele_to_DB(city_url); city_ur.delete(len,len2); } } long end = System.currentTimeMillis(); long time = (end-start)/(1000*60); pst.executeBatch(); //批处理执行最后面剩余的部分 conn.close(); System.out.println("查询到的电话号码段总数量:"+all_num_tele); System.out.println("花费的时间是:"+time+"分多钟\n"+"以微秒为单位:"+(end-start)+"微秒"); } //一个城市的手机号码段处理函数 public static void One_City_Tele_to_DB(String url) throws Exception { String city=null; String pro =null; int this_city_num = 0; Matcher mat_pro_city = get(url,pattern_pro_city); while(mat_pro_city.find()) { String long_pro = mat_pro_city.group(1); pro = long_pro.substring(0, long_pro.length()-1); String long_city = mat_pro_city.group(2); city = long_city.substring(0, long_city.length()-10); System.out.println("省份:"+pro+" "+"城市:"+city+" 正在插入号码段进数据库..."); } String temp = ",'"+pro+"','"+city+"'),"; Matcher mat_number = get(url,pattern_number); while(mat_number.find()) { PreStat.append("("+mat_number.group(1)).append(temp); this_city_num++; all_num_tele++; if(all_num_tele<=208000 && all_num_tele % 2000==0) { PreStat.deleteCharAt(PreStat.length()-1); //除去sql语句后的逗号 pst.addBatch(PreStat.toString()); pst.executeBatch(); pst.clearBatch(); PreStat.delete(len1, PreStat.length()); //情况sql语句后面部分以释放空间 } } if(all_num_tele>208000) //后面不足2000部分的城市加入批处理中先不执行 { PreStat.deleteCharAt(PreStat.length()-1); pst.addBatch(PreStat.toString()); PreStat.delete(len1, PreStat.length()); } System.out.println("该市插入的号码段的数量是:"+ this_city_num); } //正则匹配 public static Matcher get(String str_url, String pattern) throws Exception { String urlsource = get_Html(str_url); Pattern p = Pattern.compile(pattern); Matcher m = p.matcher(urlsource); return m; } //获取网页内容 public static String get_Html(String str_url) throws IOException{ URL url = new URL(str_url); String content=""; StringBuffer page = new StringBuffer(); try { BufferedReader in = new BufferedReader(new InputStreamReader(url .openStream())); while((content = in.readLine()) != null){ page.append(content); } } catch (IOException e) { e.printStackTrace(); } return page.toString(); } }
测试了几次,运行时间是在2分钟左右,又提速了不少,不过还是有很大的提升空间的,因为自己在测试时,如果程序仅仅是插入20多万的sql语句,可以在几秒钟内完成。
说道再优化,自己的思路是把网站url采集和插入数据库使用多线程进行并发操作,现在正在学习java的多线程,也在尝试用多线程的方法写采集程序,如果大家还有更好的方法,也可以给我留言,愿意和大家一起交流进步。