java多线程爬取笔趣阁所有小说(请准备够大的硬盘)
可以选择下载的数量,全部下载下来够呛,首先没那么大的盘
新版本:https://wws.lanzous.com/iAEMoghsgeb 密码:7vjz
jar包:https://wws.lanzous.com/ilphyghsgcj 密码:f38a
先导包
<dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ -->
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.2</version>
</dependency>
核心代码展示;
package com.aaa.data;

import com.aaa.config.SSLHelper;
import com.aaa.dto.BookCatalogueDto;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads novels from the "biquge5200" or "biquge" web sites into plain-text files.
 *
 * <p>Each book is fetched by {@value #PART_COUNT} worker threads, each writing its share of the
 * chapters to a part file under {@value #PART_DIR}; the parts are then merged into
 * {@code <BOOK_DIR>/<bookName>.txt}. All files are read and written as UTF-8.
 *
 * @author 三木猿
 * @version 1.0
 * @date 2020/8/10 15:16
 */
public class DownloadBook {

    private static final String SOURCE_BIQUGE5200 = "biquge5200";
    private static final String SOURCE_BIQUGE = "biquge";
    private static final String BIQUGE5200_URL = "https://www.biquge5200.com/";
    private static final String BIQUGE_URL = "https://www.biquge.com/";

    /** Directory holding the finished, merged books. */
    private static final String BOOK_DIR = "/usr/local/webapps/file/";
    /** Directory holding the per-thread part files of books still being downloaded. */
    private static final String PART_DIR = BOOK_DIR + "downloading/";
    /** Number of part files (and worker threads) used per book. */
    private static final int PART_COUNT = 3;
    /** Pause before the single retry of a failed chapter request. */
    private static final long RETRY_DELAY_MILLIS = 5000L;

    /** Recognises a chapter-title line inside a saved part file. */
    private static final Pattern CHAPTER_TITLE =
            Pattern.compile("(^\\s*第)(.{1,9})[章节卷集部篇回](\\s{1})(.*)($\\s*)");

    /** Extracts the href (group 1) and the link text (group 2) from an anchor tag. */
    private static final Pattern pattern =
            Pattern.compile("<a\\s*href=\"?([\\w\\W]*?)\"?[\\s]*?[^>]>([\\s\\S]*?)(?=</a>)");

    /** Site selected by {@link #setDataSource}; read by the download worker threads. */
    private static volatile String dataSource;

    /**
     * Selects the site to crawl and downloads books {@code 0_1} through {@code 0_<count>} from it,
     * using two threads that each handle one half of the id range. Blocks until both finish.
     *
     * <p>A book that fails is reported on stderr and skipped; the rest of the range is still
     * processed. If the calling thread is interrupted while waiting, the workers are interrupted
     * and the caller's interrupt flag is restored.
     *
     * @param dataSource {@code "biquge5200"} or {@code "biquge"}; any other value downloads nothing
     * @param count      highest book id to download
     */
    public static void setDataSource(String dataSource, int count) {
        SSLHelper.init();
        DownloadBook.dataSource = dataSource;

        final String baseUrl = baseUrlFor(dataSource);
        if (baseUrl == null) {
            System.err.println("Unsupported data source: " + dataSource);
            return;
        }

        final int middle = count / 2;
        Thread firstHalf = new Thread(() -> downloadRange(baseUrl, 1, middle));
        Thread secondHalf = new Thread(() -> downloadRange(baseUrl, middle + 1, count));
        firstHalf.start();
        secondHalf.start();
        try {
            firstHalf.join();
            secondHalf.join();
        } catch (InterruptedException e) {
            // Stop the workers instead of spawning a fresh pair, then propagate the interrupt.
            firstHalf.interrupt();
            secondHalf.interrupt();
            Thread.currentThread().interrupt();
        }
    }

    /** Returns the base URL of the given data source, or {@code null} if it is not supported. */
    private static String baseUrlFor(String source) {
        if (SOURCE_BIQUGE5200.equals(source)) {
            return BIQUGE5200_URL;
        }
        if (SOURCE_BIQUGE.equals(source)) {
            return BIQUGE_URL;
        }
        return null;
    }

    /** Downloads every book with id {@code firstId..lastId} (inclusive), skipping failures. */
    private static void downloadRange(String baseUrl, int firstId, int lastId) {
        for (int id = firstId; id <= lastId && !Thread.currentThread().isInterrupted(); id++) {
            String bookCod = "0_" + id;
            try {
                Document document = Jsoup.connect(baseUrl + bookCod + "/").get();
                Element info = document.getElementById("info");
                if (info == null) {
                    System.err.println("Skipping " + bookCod + ": page has no book info");
                    continue;
                }
                String bookName = safeFileName(info.select("h1").text());
                if (bookName.isEmpty()) {
                    System.err.println("Skipping " + bookCod + ": page has no book title");
                    continue;
                }
                if (new File(BOOK_DIR + bookName + ".txt").exists()) {
                    continue;
                }
                System.out.println("---------------" + bookName + "正在下载" + "--------------");
                List<BookCatalogueDto> bookCatalogue = getBookCatalogue(bookCod, document, pattern);
                downloadBook(bookCod, bookName, bookCatalogue);
                System.out.println("---------------" + bookName + "下载完成" + "--------------");
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // One broken book must not abort the remainder of the range.
                System.err.println("Skipping " + bookCod + ": " + e);
            }
        }
    }

    /** Replaces path separators in a scraped title so it cannot escape the target directory. */
    private static String safeFileName(String title) {
        return title.replaceAll("[\\\\/]", "_").trim();
    }

    /**
     * Downloads one book with {@value #PART_COUNT} threads and merges the parts into
     * {@code <BOOK_DIR>/<bookName>.txt}. Does nothing if that file already exists.
     *
     * @param bookCod          site id of the book, e.g. {@code "0_1"}
     * @param bookName         book title, used as the file name
     * @param bookCatalogueDto chapters of the book in reading order
     * @throws Exception if the wait for the workers is interrupted or merging the parts fails
     */
    public static void downloadBook(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws Exception {
        if (new File(BOOK_DIR + bookName + ".txt").exists()) {
            return;
        }
        Map<Integer, List<BookCatalogueDto>> parts = splitList(bookCatalogueDto, PART_COUNT);
        long start = System.currentTimeMillis();

        List<Thread> workers = new ArrayList<>(PART_COUNT);
        for (int i = 0; i < PART_COUNT; i++) {
            final List<BookCatalogueDto> part = parts.get(i);
            final String partName = bookName + (i + 1);
            Thread worker = new Thread(() -> {
                try {
                    if (SOURCE_BIQUGE5200.equals(dataSource)) {
                        biquge5200(bookCod, partName, part);
                    } else if (SOURCE_BIQUGE.equals(dataSource)) {
                        biquge(bookCod, partName, part);
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            });
            workers.add(worker);
            worker.start();
        }

        try {
            for (Thread worker : workers) {
                worker.join();
            }
        } catch (InterruptedException e) {
            // Leave the part files in place so a later run can resume from them.
            for (Thread worker : workers) {
                worker.interrupt();
            }
            throw e;
        }

        // Merge the part files into the final book.
        combine(bookName);
        long end = System.currentTimeMillis();
        System.out.println("本次下载共用时" + (end - start));
    }

    /**
     * Downloads the given chapters from biquge5200 and appends them to the part file
     * {@code <PART_DIR>/<bookName>.txt}, resuming after the last chapter already saved there.
     * Chapter text on this site is delimited by {@code <p>…</p>} tags.
     *
     * @param bookCod          site id of the book
     * @param bookName         name of the part file (without extension)
     * @param bookCatalogueDto chapters to download, in order
     * @throws Exception if writing the part file fails or the thread is interrupted
     */
    public static void biquge5200(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws Exception {
        File partFile = new File(PART_DIR + bookName + ".txt");
        List<BookCatalogueDto> pending = pendingChapters(partFile, bookName, bookCatalogueDto);
        if (pending.isEmpty()) {
            return;
        }
        // Save the crawled chapters to disk as plain text.
        try (BufferedWriter writer = openAppendWriter(partFile)) {
            for (BookCatalogueDto chapter : pending) {
                String url = BIQUGE5200_URL + bookCod + "/" + chapter.getCatalogueCod() + ".html";
                saveChapter(writer, url, true);
            }
        }
    }

    /**
     * Builds the chapter catalogue of a book from the {@code <dd>} entries of its index page.
     *
     * @param bookCod  site id of the book, stored on every returned entry
     * @param document parsed index page of the book
     * @param pattern  anchor-tag pattern whose group 1 is the chapter URL
     * @return catalogue entries in page order
     * @throws InterruptedException never thrown by this implementation; kept for compatibility
     */
    public static List<BookCatalogueDto> getBookCatalogue(String bookCod, Document document, Pattern pattern) throws InterruptedException {
        // Parsing is cheap and done inline, so a malformed entry surfaces to the caller
        // instead of silently dropping a third of the catalogue in a dead worker thread.
        return get(document.getElementsByTag("dd"), bookCod, document, pattern);
    }

    /**
     * Converts catalogue {@code <dd>} elements into catalogue entries. Elements without a
     * chapter link that carries a numeric chapter id cannot be downloaded and are skipped.
     *
     * @param dd       catalogue elements
     * @param bookCod  site id of the book, stored on every returned entry
     * @param document index page the elements came from (unused)
     * @param pattern  anchor-tag pattern whose group 1 is the chapter URL
     * @return one entry per element with a usable chapter link, in input order
     */
    public static List<BookCatalogueDto> get(List<Element> dd, String bookCod, Document document, Pattern pattern) {
        List<BookCatalogueDto> entries = new ArrayList<>(dd.size());
        for (Element element : dd) {
            // The link is expected in the last child node that renders to non-empty HTML.
            Node linkNode = null;
            for (Node child : element.childNodes()) {
                if (!"".equals(child.toString())) {
                    linkNode = child;
                }
            }
            if (linkNode == null) {
                continue;
            }
            Matcher matcher = pattern.matcher(linkNode.toString());
            if (!matcher.find()) {
                continue;
            }
            // The chapter id is the file name of the URL without its extension.
            String chapterUrl = matcher.group(1);
            int idStart = chapterUrl.lastIndexOf("/") + 1;
            int idEnd = chapterUrl.lastIndexOf(".");
            if (idEnd <= idStart) {
                continue;
            }
            int chapterId;
            try {
                chapterId = Integer.parseInt(chapterUrl.substring(idStart, idEnd));
            } catch (NumberFormatException notAChapterLink) {
                continue;
            }
            BookCatalogueDto entry = new BookCatalogueDto();
            entry.setCatalogueCod(chapterId);
            entry.setBookCod(bookCod);
            entry.setCatalogueName(element.text());
            entries.add(entry);
        }
        return entries;
    }

    /**
     * Downloads the given chapters from biquge and appends them to the part file
     * {@code <PART_DIR>/<bookName>.txt}, resuming after the last chapter already saved there.
     * Chapter text on this site is delimited by {@code <br>} tags, and the chapter URL is built
     * from the book id stored on each catalogue entry.
     */
    private static void biquge(String bookCod, String bookName, List<BookCatalogueDto> bookCatalogueDto) throws IOException, InterruptedException {
        File partFile = new File(PART_DIR + bookName + ".txt");
        List<BookCatalogueDto> pending = pendingChapters(partFile, bookName, bookCatalogueDto);
        if (pending.isEmpty()) {
            return;
        }
        // Save the crawled chapters to disk as plain text.
        try (BufferedWriter writer = openAppendWriter(partFile)) {
            for (BookCatalogueDto chapter : pending) {
                String url = BIQUGE_URL + chapter.getBookCod() + "/" + chapter.getCatalogueCod() + ".html";
                saveChapter(writer, url, false);
            }
        }
    }

    /**
     * Returns the chapters that still have to be downloaded into a part file: the whole
     * catalogue for a new file, otherwise everything after the last chapter title found in it.
     */
    private static List<BookCatalogueDto> pendingChapters(File partFile, String bookName, List<BookCatalogueDto> catalogue) {
        if (!partFile.exists()) {
            File dir = partFile.getParentFile();
            if (dir != null) {
                dir.mkdirs();
            }
            return catalogue;
        }
        List<BookCatalogueDto> saved = txtCatalogue(bookName);
        if (saved.isEmpty()) {
            return catalogue;
        }
        BookCatalogueDto lastSaved = saved.get(saved.size() - 1);
        for (int i = 0; i < catalogue.size(); i++) {
            if (lastSaved.getCatalogueName().equals(catalogue.get(i).getCatalogueName())) {
                return catalogue.subList(i + 1, catalogue.size());
            }
        }
        return catalogue;
    }

    /** Opens a UTF-8 writer that appends to the given file. */
    private static BufferedWriter openAppendWriter(File file) throws FileNotFoundException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8));
    }

    /**
     * Fetches one chapter page and appends its title and text lines to the writer. A chapter
     * that cannot be fetched or has no {@code #content} element is reported and skipped.
     *
     * @param paragraphTags {@code true} if the text is split on {@code </p>} (after removing
     *                      {@code <p>}), {@code false} if it is split on {@code <br>}
     */
    private static void saveChapter(BufferedWriter writer, String url, boolean paragraphTags) throws IOException, InterruptedException {
        Document page = fetchWithRetry(url);
        if (page == null) {
            return;
        }
        Elements content = page.select("#content");
        if (content.isEmpty()) {
            System.err.println("Skipping chapter " + url + ": no content element");
            return;
        }
        String body = content.get(0).html()
                .replace("<div id='content'>", "")
                .replace("</div>", "")
                .replace("<script>readx();</script>", "")
                .replace("<script>chaptererror();</script>", "");
        String[] lines = paragraphTags ? body.replace("<p>", "").split("</p>") : body.split("<br>");

        writer.write(page.select("h1").text());
        writer.newLine();
        for (String line : lines) {
            writer.write(line);
            writer.newLine();
        }
        // Flush per chapter so a killed run leaves whole chapters behind for the resume logic.
        writer.flush();
    }

    /**
     * Fetches a page, retrying once after {@value #RETRY_DELAY_MILLIS} ms on an I/O failure.
     *
     * @return the parsed page, or {@code null} if both attempts failed
     */
    private static Document fetchWithRetry(String url) throws InterruptedException {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException firstFailure) {
            Thread.sleep(RETRY_DELAY_MILLIS);
            try {
                return Jsoup.connect(url).get();
            } catch (IOException secondFailure) {
                System.err.println("Skipping chapter " + url + ": " + secondFailure);
                return null;
            }
        }
    }

    /**
     * Splits a list into {@code num} consecutive sub-list views keyed {@code 0..num-1}. Every
     * part has {@code t.size() / num} elements except the last, which also takes the remainder.
     *
     * @param t   list to split
     * @param num number of parts
     * @param <T> element type
     * @return map from part index to a view of that part of {@code t}
     */
    public static <T> Map<Integer, List<T>> splitList(List<T> t, int num) {
        Map<Integer, List<T>> parts = new HashMap<>();
        int chunk = t.size() / num;
        for (int i = 0; i < num; i++) {
            int from = i * chunk;
            int to = (i == num - 1) ? t.size() : from + chunk;
            parts.put(i, t.subList(from, to));
        }
        return parts;
    }

    /**
     * Appends the part files {@code <bookName>1.txt … <bookName>3.txt} from the downloading
     * directory to {@code <BOOK_DIR>/<bookName>.txt}, in order, deleting each part afterwards.
     *
     * @param bookName book title, used as the file name
     * @throws Exception if reading a part or writing the book fails
     */
    public static void combine(String bookName) throws Exception {
        File book = new File(BOOK_DIR + bookName + ".txt");
        try (BufferedWriter writer = openAppendWriter(book)) {
            for (int i = 1; i <= PART_COUNT; i++) {
                File part = new File(PART_DIR + bookName + i + ".txt");
                if (part.exists()) {
                    try (BufferedReader reader = new BufferedReader(
                            new InputStreamReader(new FileInputStream(part), StandardCharsets.UTF_8))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            writer.write(line);
                            writer.newLine();
                        }
                    }
                    // Make sure the content is on disk before its source is removed.
                    writer.flush();
                }
                part.delete();
            }
        }
    }

    /**
     * Lists the chapter titles found in the part file {@code <PART_DIR>/<bookName>.txt}.
     * A line counts as a title when it matches the chapter-title pattern.
     *
     * @param bookName name of the part file (without extension)
     * @return one entry per title line, in file order; empty if the file is missing or unreadable
     */
    public static List<BookCatalogueDto> txtCatalogue(String bookName) {
        List<BookCatalogueDto> chapters = new ArrayList<>();
        File file = new File(PART_DIR + bookName + ".txt");
        if (!file.isFile()) {
            System.out.println("找不到指定的文件");
            return chapters;
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = CHAPTER_TITLE.matcher(line);
                if (matcher.find()) {
                    BookCatalogueDto chapter = new BookCatalogueDto();
                    chapter.setBookName(bookName);
                    chapter.setCatalogueName(matcher.group().trim());
                    chapters.add(chapter);
                }
            }
        } catch (IOException e) {
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
        return chapters;
    }
}