java爬虫中jsoup的使用
jsoup可以用来解析HTML的内容,其功能非常强大,它可以像JavaScript那样直接从网页中提取有用的信息。
例如1:
- 从html字符串中解析数据
//直接从字符串中获取 public static void getParByString() { String html = "<html><head><title> 这里是字符串内容</title></head"+ ">"+"<body><p class='p1'> 这里是 jsoup 作用的相关演示</p></body></html>"; Document doc = Jsoup.parse(html); Elements links = doc.select("p[class]"); for(Element link:links){ String linkclass = link.className(); String linkText = link.text(); System.out.println(linkText); System.out.println(linkclass); } }
- 从本地文件中解析数据
//从本地文件中获取 public static void getHrefByLocal() { File input = new File("C:\\Users\\Idea\\Desktop\\html\\Home.html"); Document doc = null; try { doc = Jsoup.parse(input,"UTF-8","http://www.oschina.net/"); //这里后面加了网址是为了解决后面绝对路径和相对路径的问题 } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } Elements links = doc.select("a[href]"); for(Element link:links){ String linkHref = link.attr("href"); String linkText = link.text(); System.out.println(linkText+":"+linkHref); } }
- 直接从网络上解析数据
public static HashMap getHrefByNet(String url) { HashMap hm = new HashMap(); String href = null; try { //这是get方式得到的 Document doc = Jsoup.connect(url).get(); String title = doc.title(); Elements links = doc.select("a[href]"); for(Element link:links){ String linkHref = link.attr("abs:href"); String linkText = link.text(); //System.out.println(linkText+":"+linkHref); hm.put(linkText, linkHref); href=linkText; } //System.out.println("***************"); //另外一种是post方式 /*@SuppressWarnings("unused") Document doc_Post = Jsoup.connect(url) .data("query","Java") .userAgent("I am jsoup") .cookie("auth","token") .timeout(10000) .post(); Elements links_Post = doc.select("a[href]"); for(Element link:links_Post){ String linkHref = link.attr("abs:href"); String linkText = link.text(); //System.out.println(linkText+":"+linkHref); //map.put(linkText, linkHref); }*/ } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); hm.put("加载失败", "error"); } return hm ; }
注意:使用前需要导入jsoup的以下包(import语句):
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;
最后附上jar包下载地址:
http://jsoup.org/packages/jsoup-1.8.1.jar
具体实际项目请看java爬虫实战项目
循环遍历Hashtable中的键和值
/* Create a sample table of key/value pairs. */
Hashtable h = new Hashtable();
/* Add an entry; key and value come from the surrounding code. */
h.put(key, value);
/* Visit every entry in turn, printing its value first and then its key. */
for (Object item : h.entrySet()) {
    Map.Entry pair = (Map.Entry) item;
    System.out.println(pair.getValue());
    System.out.println(pair.getKey());
}
java文件夹的创建(先判断是否存在,如果不存在就创建)
//创建文件夹(如果不存在就创建,存在就不变) public void makedir(){ //定义文件夹路径 String filePath = "D://home//Lucy"; File file = new File(filePath); if(!file.exists()&&!file.isDirectory()) { System.out.println("不存在"); file.mkdirs(); //创建文件夹 注意mkdirs()和mkdir()的区别 //判断是否创建成功 if(file.exists()&&file.isDirectory()) //文件夹存在并且是文件夹 { System.out.println("文件夹创建成功!"); } else{ System.out.println("文件创建不成功!"); } } else{ System.out.println("文件已经存在!"); } }
java文件的创建(先判断是否存在,如果不存在就创建)
//创建文件,如果不存在就创建文件 public void makeFile() { String fileName = "D://file2.txt"; File file = new File(fileName); if(!file.exists()&&!file.isFile()) { try { if(file.createNewFile()) //创建文件,返回布尔值,如果成功为true,否则为false { System.out.println("文件创建成功!"); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else{ System.out.println("文件已经存在!"); } }
在文件中写入内容
//往文件中写入文本 public void writeText(String s) { String fileName = "D://file2.txt"; File file = new File(fileName); if(file.exists()&&file.isFile()) //如果文件存在,可以写入内容 { FileOutputStream fos = null; try { fos = new FileOutputStream(fileName); } catch (FileNotFoundException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } try { fos.write(s.getBytes()); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } try { fos.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } else{ System.out.println("文件不存在,不能写入内容"); } }
java获取系统时间:
/**
 * Prints the current system time three ways: as {@code yyyy-MM-dd HH:mm:ss}, in a Chinese
 * year/month/day layout, and using {@link Date#toString()}.
 */
public static void getTime() {
    Date now = new Date();
    SimpleDateFormat dashed = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    SimpleDateFormat chinese = new SimpleDateFormat("yyyy年MM月dd日 HH时mm分ss秒");

    // All three lines describe the same instant.
    System.out.println(dashed.format(now));
    System.out.println(chinese.format(now));
    System.out.println(now);
}
java连接mysql数据库
首先在项目中添加MySQL的JDBC驱动jar包(mysql-connector-java):下载jar包
/**
 * Small JDBC helper that runs ad-hoc SQL statements against a MySQL database.
 *
 * <p>{@link #excute(String)} and {@link #select(String, int, int)} each open a fresh connection
 * through {@link #init()} and release every JDBC handle before returning.
 */
public class connectDoctorMySql {
    // Earlier configuration, kept for reference:
    //   url "jdbc:mysql://192.168.0.16/hive", user "hive", password "hive"

    /** JDBC URL of the database to connect to. */
    public static final String url = "jdbc:mysql://127.0.0.1/orcl?useUnicode=true&characterEncoding=utf-8&useSSL=false";
    /** Class name of the JDBC driver loaded by {@link #init()}. */
    public static final String name = "com.mysql.jdbc.Driver";
    /** Database user name. */
    public static final String user = "root";
    // NOTE(review): the password is hard-coded in source; consider reading it from configuration.
    /** Database password. */
    public static final String password = "China123";

    public Connection conn = null;
    public PreparedStatement pst = null;
    public Statement stmt = null;
    ResultSet rs = null;

    /**
     * Loads the driver, opens a connection and creates a statement on it.
     *
     * <p>On failure a message and the stack trace are printed and {@link #stmt} is left
     * {@code null}, which the query methods use to detect that no connection is available.
     */
    public void init() {
        // Drop handles from any previous call so a failed attempt is never mistaken for success.
        conn = null;
        stmt = null;
        try {
            Class.forName(name); // load the JDBC driver
            conn = DriverManager.getConnection(url, user, password);
            stmt = conn.createStatement();
        } catch (Exception e) {
            System.out.println("数据库连接失败. . .");
            e.printStackTrace();
        }
    }

    /**
     * Executes an INSERT/UPDATE/DELETE or DDL statement.
     *
     * <p>The SQL text is sent to the database verbatim, so callers must not assemble it from
     * untrusted input. Failures are reported on standard output; nothing is thrown.
     *
     * @param sql statement to execute
     */
    public void excute(String sql) {
        init();
        try {
            if (stmt == null) {
                // init() has already reported why no connection is available.
                return;
            }
            stmt.executeUpdate(sql);
        } catch (SQLException e) {
            System.out.println("数据执行失败:" + sql); // echo the failing SQL
            e.printStackTrace();
        } finally {
            closeAll();
        }
    }

    /**
     * Runs a query and returns two columns of every row.
     *
     * <p>The SQL text is sent to the database verbatim, so callers must not assemble it from
     * untrusted input.
     *
     * @param sql query to run
     * @param x   index of the first column to read, as accepted by {@code ResultSet.getString(int)}
     * @param y   index of the second column to read
     * @return one {@code String[2]} per row holding columns {@code x} and {@code y}; empty when
     *         the connection could not be opened, and possibly partial when the query fails
     */
    public ArrayList select(String sql, int x, int y) {
        init();
        ArrayList<String[]> result = new ArrayList<String[]>();
        try {
            if (stmt == null) {
                return result;
            }
            // Stored in the field (not a local) so that closeAll() releases it.
            rs = stmt.executeQuery(sql);
            while (rs.next()) {
                String[] str = new String[2];
                str[0] = rs.getString(x);
                str[1] = rs.getString(y);
                result.add(str);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeAll();
        }
        return result;
    }

    /** Closes the result set, both statements and the connection; each close is independent. */
    private void closeAll() {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (pst != null) {
                pst.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (stmt != null) {
                stmt.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
java连接oracle数据库
/**
 * Small JDBC helper that runs ad-hoc SQL statements against an Oracle database.
 *
 * <p>{@link #start()}, {@link #excute(String)} and {@link #select(String, int, int)} each open
 * a fresh connection through {@link #init()} and release every JDBC handle before returning.
 */
public class connectDoctor {
    /** JDBC URL of the Oracle database to connect to. */
    public static final String url = "jdbc:oracle:thin:@127.0.0.1:1521:orcl";
    /** Class name of the JDBC driver loaded by {@link #init()}. */
    public static final String name = "oracle.jdbc.driver.OracleDriver";
    /** Database user name. */
    public static final String user = "c238891";
    // NOTE(review): the password is hard-coded in source; consider reading it from configuration.
    /** Database password. */
    public static final String password = "Rapid111";

    public Connection conn = null;
    public PreparedStatement pst = null;
    public Statement stmt = null;
    ResultSet rs = null;

    /**
     * Loads the driver, opens a connection and creates a statement on it.
     *
     * <p>On failure a message and the stack trace are printed and the handle that could not be
     * obtained is left {@code null}, which the query methods use to detect the failure.
     */
    public void init() {
        // Drop handles from any previous call so a failed attempt is never mistaken for success.
        conn = null;
        stmt = null;
        try {
            Class.forName(name); // load the JDBC driver
            conn = DriverManager.getConnection(url, user, password);
            stmt = conn.createStatement();
        } catch (Exception e) {
            // The old text said "insert failed", which did not describe a connection failure.
            System.out.println("数据库连接失败:");
            e.printStackTrace();
        }
    }

    /** Connection smoke test: prints every row of the {@code emp} table. */
    public void start() {
        init();
        String sql = "select * from emp";
        try {
            if (conn == null) {
                // init() has already reported why no connection is available.
                return;
            }
            pst = conn.prepareStatement(sql);
            rs = pst.executeQuery();
            while (rs.next()) {
                System.out.println("编号:" + rs.getString("empno")
                        + ";姓名:" + rs.getString("ename")
                        + "; 工作:" + rs.getString("job")
                        + "; 领导:" + rs.getString("mgr")
                        + "; 雇佣日期:" + rs.getString("hiredate")
                        + "; 工资:" + rs.getString("sal")
                        + "; 奖金:" + rs.getString("comm")
                        + "; 部门:" + rs.getString("deptno"));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // Runs even when the query failed before a result set existed.
            closeAll();
        }
    }

    /**
     * Executes an INSERT/UPDATE/DELETE or DDL statement.
     *
     * <p>The SQL text is sent to the database verbatim, so callers must not assemble it from
     * untrusted input. On failure the SQL and the stack trace are printed; nothing is thrown.
     *
     * @param sql statement to execute
     */
    public void excute(String sql) {
        init();
        try {
            if (stmt == null) {
                return;
            }
            stmt.executeUpdate(sql);
        } catch (SQLException e) {
            System.out.println(sql); // echo the failing SQL
            e.printStackTrace();
        } finally {
            closeAll();
        }
    }

    /**
     * Runs a query and returns two columns of every row.
     *
     * <p>The SQL text is sent to the database verbatim, so callers must not assemble it from
     * untrusted input.
     *
     * @param sql query to run
     * @param x   index of the first column to read, as accepted by {@code ResultSet.getString(int)}
     * @param y   index of the second column to read
     * @return one {@code String[2]} per row holding columns {@code x} and {@code y}; empty when
     *         the connection could not be opened, and possibly partial when the query fails
     */
    public ArrayList select(String sql, int x, int y) {
        init();
        ArrayList<String[]> result = new ArrayList<String[]>();
        try {
            if (stmt == null) {
                return result;
            }
            // Stored in the field (not a local) so that closeAll() releases it.
            rs = stmt.executeQuery(sql);
            while (rs.next()) {
                String[] str = new String[2];
                str[0] = rs.getString(x);
                str[1] = rs.getString(y);
                result.add(str);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            closeAll();
        }
        return result;
    }

    /** Closes the result set, both statements and the connection; each close is independent. */
    private void closeAll() {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (pst != null) {
                pst.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (stmt != null) {
                stmt.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}