A Simple Web Crawler Written in Java
It sounded like fun, so yesterday I spent some time looking into Java crawlers.
I studied a few samples found online and wrote a simple crawler modeled on them. It can automatically crawl the chapters of a novel (the extraction patterns have to be customized per site), and with multiple threads plus a lock it crawls faster and can even fetch several books at once.
The extraction logic currently targets Qidian (起点中文网). A list holds the URLs still waiting to be crawled and a set holds the URLs already crawled; maps record each URL's book title and chapter number; the chapter text is then written out to files.
The implementation consists of two classes:
AllUrl.java
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.*;

public class AllUrl {
    public static int maxDepth = 100;                               // max chapters per book
    public static int maxThread = 3;                                // max worker threads
    public static List<String> waitUrl = new ArrayList<>();        // URLs waiting to be crawled
    public static Set<String> overUrl = new HashSet<>();           // URLs already crawled
    public static Map<String, Integer> UrlDepth = new HashMap<>(); // URL -> chapter number
    public static Map<String, String> bookName = new HashMap<>();  // URL -> book title
    public static String savePath = "E:\\起点book\\";              // output directory (must already exist)

    public static synchronized void workUrl(String url, int depth) {
        if (depth > AllUrl.maxDepth) {
            System.out.println("《" + bookName.get(url) + "》 has reached the configured chapter limit; stopping.");
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
            Date date = new Date(System.currentTimeMillis());
            System.out.println(formatter.format(date));
            Thread.currentThread().interrupt(); // signal this worker thread to exit its loop
        } else if (AllUrl.overUrl.contains(url)) {
            System.out.println(url + " has already been crawled");
        } else {
            try {
                URL url1 = new URL(url);
                URLConnection urlConnection = url1.openConnection();
                InputStream is = urlConnection.getInputStream();
                BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                StringBuilder stringBuilder = new StringBuilder();
                String tempString;
                while ((tempString = br.readLine()) != null) {
                    stringBuilder.append(tempString); // accumulate the whole page into one string
                }

                // The marker strings below are specific to https://www.qidian.com/ (Qidian).
                String sStart = "class=\"read-content j_readContent\"", tem = "<p>", Sin;
                StringBuilder nextUrl = new StringBuilder("<a id=\"j_chapterNext\" href=\"");
                int start = stringBuilder.indexOf(sStart),
                    end = stringBuilder.indexOf(tem, start + 1),
                    AllEnd = stringBuilder.indexOf("<div class=\"admire-wrap\">");

                // Extract the next-chapter link and queue it up.
                int nextUrlStart = stringBuilder.indexOf(nextUrl.toString()) + nextUrl.length();
                int nextUrlEnd = stringBuilder.indexOf("\"", nextUrlStart + 1);
                nextUrl.setLength(0);          // reuse the builder for the result
                nextUrl.append("https:");      // the href on the page is protocol-relative
                nextUrl.append(stringBuilder.substring(nextUrlStart, nextUrlEnd));
                addUrl(nextUrl.toString(), depth + 1, bookName.get(url));

                // Hard-coded offsets skip the wrapper markup around the chapter body.
                start += sStart.length() + 20;
                AllEnd -= 10;

                // One file per chapter; the book title and chapter number keep names unique.
                PrintWriter pw = new PrintWriter(new File(savePath + bookName.get(url) + "第" + depth + "章" + ".txt"));
                while (end != -1 && end < AllEnd) {
                    Sin = stringBuilder.substring(start, end);
                    pw.println(Sin);           // write one paragraph of the chapter
                    start = end + tem.length();
                    end = stringBuilder.indexOf(tem, start + 1);
                }
                pw.close();
                br.close();
                overUrl.add(url);
                System.out.println("《" + bookName.get(url) + "》 chapter done. Total chapters crawled: " + overUrl.size() + ", chapters still queued: " + waitUrl.size());
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    // Take the next URL off the waiting list, or return null if it is empty.
    public static synchronized String getUrl() {
        if (!waitUrl.isEmpty()) {
            return waitUrl.remove(0);
        }
        return null;
    }

    // Queue a URL unless it has already been crawled.
    public static synchronized void addUrl(String Url, int Depth, String bName) {
        if (!overUrl.contains(Url)) {
            waitUrl.add(Url);
            UrlDepth.put(Url, Depth);
            bookName.put(Url, bName);
            System.out.println("Chapter " + Depth + " of 《" + bookName.get(Url) + "》 queued; " + waitUrl.size() + " tasks waiting.");
        } else {
            System.out.println("Chapter " + Depth + " of 《" + bookName.get(Url) + "》 was already crawled; skipping.");
        }
    }
}
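A side note: the intro mentions custom "regular expressions", but workUrl actually locates everything with plain indexOf marker strings, which fail silently (or throw) when Qidian changes its markup. For comparison, here is a minimal sketch of pulling the next-chapter link with java.util.regex; the pattern simply mirrors the j_chapterNext anchor assumed by the code above and is my guess at the page structure, not a guaranteed one.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NextUrlExtractor {
    // Assumed pattern: the <a id="j_chapterNext" ...> anchor used by the
    // indexOf-based code above; Qidian may change this markup at any time.
    private static final Pattern NEXT_LINK =
            Pattern.compile("<a id=\"j_chapterNext\" href=\"([^\"]+)\"");

    /** Returns the absolute next-chapter URL, or null if the marker is absent. */
    public static String extract(String html) {
        Matcher m = NEXT_LINK.matcher(html);
        if (!m.find()) {
            return null; // no match: end of book, or the markup changed
        }
        return "https:" + m.group(1); // the page's href is protocol-relative (//...)
    }
}

The benefit over raw indexOf is that a miss comes back as null instead of an out-of-bounds exception, so the caller can treat "no next chapter" as the natural end of the book.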
CrawlTheWeb.java
import java.text.SimpleDateFormat;
import java.util.Date;

public class CrawlTheWeb extends Thread {
    // Shared lock: idle threads sleep on it when the waiting list is empty.
    public static Object lock = new Object();

    public void run() {
        while (true) {
            if (Thread.currentThread().isInterrupted()) {
                System.out.println(Thread.currentThread().getName() + " finished its work.");
                break;
            }
            // Take the next URL through the synchronized accessor, so two threads
            // can never pop the same head element.
            String nextUrl = AllUrl.getUrl();
            if (nextUrl != null) {
                System.out.println(this.getName() + " crawling 《" + AllUrl.bookName.get(nextUrl) + "》, chapter " + AllUrl.UrlDepth.get(nextUrl));
                AllUrl.workUrl(nextUrl, AllUrl.UrlDepth.get(nextUrl));
            } else {
                synchronized (lock) {
                    try {
                        System.out.println("Waiting list is empty; " + this.getName() + " going to sleep.");
                        // Nothing ever calls lock.notify(), so wait with a timeout
                        // and re-check the list instead of sleeping forever.
                        lock.wait(1000);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt(); // restore the flag so the loop exits
                    }
                }
            }
        }
    }

    public static void main(String[] args) {
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd 'at' HH:mm:ss z");
        Date date = new Date(System.currentTimeMillis());
        System.out.println(formatter.format(date));

        // Seed one starting chapter per book.
        String strUrl = "https://read.qidian.com/chapter/D-1F0Iq1JGPOVUeyz9PqUQ2/DIfEaAmW-9X6ItTi_ILQ7A2/";
        AllUrl.addUrl(strUrl, 1, "模拟器:开局天牢死囚");
        strUrl = "https://read.qidian.com/chapter/W08HMrSPUHj7X4qr8VpWrA2/8W_pmmniqFvM5j8_3RRvhw2/";
        AllUrl.addUrl(strUrl, 1, "我的属性修行人生");
        strUrl = "https://read.qidian.com/chapter/q2B9dFLoeqU3v1oFI-DX8Q2/dsXQ94IHlUZp4rPq4Fd4KQ2/";
        AllUrl.addUrl(strUrl, 1, "这个武圣超有素质");

        for (int i = 0; i < AllUrl.maxThread; i++) {
            new CrawlTheWeb().start();
        }
    }
}
Most people recommend implementing Runnable instead, but I don't need that yet; for now I'll stick with extending Thread, which I'm a bit more familiar with.
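For reference, here is a rough sketch of what that Runnable version might look like, paired with the ExecutorService that CrawlTheWeb.java originally imported but never used. The LinkedBlockingQueue, the 30-second poll timeout, and the CrawlTask name are my own assumptions, not part of the original design; AllUrl.addUrl would also have to offer new links into this queue instead of waitUrl.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class CrawlTask implements Runnable {
    // Thread-safe queue replaces AllUrl.waitUrl plus the manual lock/wait dance.
    // Assumes AllUrl.addUrl is changed to offer new links into this queue.
    static final BlockingQueue<String> waitUrl = new LinkedBlockingQueue<>();

    @Override
    public void run() {
        try {
            while (true) {
                // Block up to 30s for work, then give up instead of waiting forever.
                String url = waitUrl.poll(30, TimeUnit.SECONDS);
                if (url == null) break;
                AllUrl.workUrl(url, AllUrl.UrlDepth.get(url));
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore the flag and exit
        }
    }

    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(AllUrl.maxThread);
        for (int i = 0; i < AllUrl.maxThread; i++) {
            pool.execute(new CrawlTask());
        }
        pool.shutdown(); // accept no new tasks; workers drain the queue and exit
    }
}

The main win is that the blocking queue hands out each URL to exactly one worker and parks idle threads for free, so the explicit lock object and the lock.wait() bookkeeping disappear entirely.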