// 利用 jsoup 爬虫工具爬取数据，并导出为 Excel 表格
import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFCellStyle; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class aaaa { public static void main(String[] args) { aaaa t = new aaaa(); t.getExl(); } //得到URL数据 public List<String> getText() throws IOException{ List<String>Url=new ArrayList<String>(); //批量导入网址 String file = "shop.txt"; BufferedReader in = new BufferedReader( new InputStreamReader( new BufferedInputStream( new FileInputStream(file)), "gbk")); String str; while((str= in.readLine())!=null){ Url.add(str); System.out.println(str); } in.close(); return Url; } //将抓取的数据组合成json对象 public List<JSONObject> parseUrl() { List<JSONObject>UTL=new ArrayList<JSONObject>(); try { for(String url:getText()){ //将url这个网页解析成一个dom对象 Document doc = Jsoup.connect(url).get(); //筛选<a class="mall-icon">的所有对象 Elements hrefs = doc.select("a.mall-icon"); //遍历改对象数组 for(Element href:hrefs){ //对该对象的元素进行处理,包装成另一个url进行二级网站的数据抓取 String number=href.attr("data-uid"); String jsonurl="http://rate.taobao.com/ShopService4C.htm?userNumId="+number; System.out.println(jsonurl); Document doc1 = Jsoup.connect(jsonurl).get(); Element jsons=doc1.body(); JSONObject json=JSONObject.fromObject(jsons.toString().replaceAll("<body>|</body>", "").replace(""", "")); UTL.add(json); } } } catch (IOException e) { e.printStackTrace(); } return UTL; } //导出excel表格 public void getExl(){ try { // 第一步,创建一个webbook,对应一个Excel文件 
HSSFWorkbook wb = new HSSFWorkbook(); // 第二步,在webbook中添加一个sheet,对应Excel文件中的sheet HSSFSheet sheet = wb.createSheet("行业统计"); // 第三步,在sheet中添加表头第0行,注意老版本poi对Excel的行数列数有限制short HSSFRow row = sheet.createRow((int) 0); // 第四步,创建单元格,并设置值表头 设置表头居中 HSSFCellStyle style = wb.createCellStyle(); style.setAlignment(HSSFCellStyle.ALIGN_CENTER); // 创建一个居中格式 HSSFCell cell = row.createCell((short) 0); cell.setCellValue("URL"); cell.setCellStyle(style); cell = row.createCell((short) 1); cell.setCellValue("行业均值"); cell.setCellStyle(style); cell = row.createCell((short) 2); cell.setCellValue("本店值"); cell.setCellStyle(style); List<JSONObject>list=parseUrl(); List<String>url=getText(); System.out.println(list.size()); System.out.println(url.size()); for (int i = 0; i < list.size(); i++) { row = sheet.createRow((int) i + 1); JSONObject json=(JSONObject)list.get(i); JSONObject ratRefund=json.optJSONObject("ratRefund"); // 第四步,创建单元格,并设置值 System.out.println(i); row.createCell((short) 0).setCellValue("淘宝淘宝"); row.createCell((short) 1).setCellValue(ratRefund.optString("indVal")); row.createCell((short) 2).setCellValue(ratRefund.optString("localVal")); } FileOutputStream fout = new FileOutputStream("E:/taobao.xls"); wb.write(fout); fout.close(); } catch (Exception e) { e.printStackTrace(); } } }